Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
STEP 1 : Data Loading¶
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(data_url,sep =';')
STEP 2 : Exploring the Data¶
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
features = data.columns[:-1].values
labels = [data.columns[-1]]
print('Features List: ',features)
print('Labels List: ',labels)
Features List: ['fixed acidity' 'volatile acidity' 'citric acid' 'residual sugar' 'chlorides' 'free sulfur dioxide' 'total sulfur dioxide' 'density' 'pH' 'sulphates' 'alcohol'] Labels List: ['quality']
data.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
print(data['quality'].value_counts())
5 681 6 638 7 199 4 53 8 18 3 10 Name: quality, dtype: int64
Visualization
sns.set()
data.quality.hist()
plt.xlabel('Wine Quality')
plt.ylabel('Count')
plt.show()
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x1ed204f2df0>
TRAIN-TEST SPLIT¶
Writing a function to split the data into training and test. Make sure to set the seed so that we get the same test set in the next run.
def split_train_test(data, test_ratio):
#set the random seed
np.random.seed(42)
#shuffle the dataset
shuffled_indices = np.random.permutation(len(data))
#calculate the size of the test set.
test_set_size = int(len(data)*test_ratio)
#split dataset to get trainning and test sets.
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]
return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(data, 0.2)
from sklearn.model_selection import train_test_split
train_set, testa_set = train_test_split(data, test_size=0.2, random_state=42)
STRATIFIED - SHUFFLE SPLIT¶
from re import split
from sklearn.model_selection import StratifiedShuffleSplit
split_data= StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split_data.split(data, data["quality"]):
strat_train_set= data.loc[train_index]
strat_test_set=data.loc[test_index]
strat_dist= strat_test_set['quality'].value_counts() / len(strat_test_set)
overall_dist= data['quality'].value_counts() / len(data)
DISTRIBUTION - COMPARISION¶
dist_comparison = pd.DataFrame({'overall': overall_dist, 'stratified': strat_dist})
dist_comparison['diff(s-o'] = dist_comparison['stratified'] - dist_comparison['overall']
dist_comparison['diff(s-o)_pct'] = 100 * (dist_comparison['diff(s-o']/dist_comparison['overall'])
print(dist_comparison)
overall stratified diff(s-o diff(s-o)_pct 5 0.425891 0.425000 -0.000891 -0.209251 6 0.398999 0.400000 0.001001 0.250784 7 0.124453 0.125000 0.000547 0.439698 4 0.033146 0.034375 0.001229 3.708726 8 0.011257 0.009375 -0.001882 -16.718750 3 0.006254 0.006250 -0.000004 -0.062500
#let's contrast this with random sampling
random_dist = test_set['quality'].value_counts()/len(test_set)
random_dist
6 0.413793 5 0.407524 7 0.131661 4 0.028213 8 0.015674 3 0.003135 Name: quality, dtype: float64
Compare the difference in distribution of stratified and uniform sampling: stratified sampling gives us test distribution closer to the overall distribution than the random sampling.
STEP 3 : Data Visualization¶
performed on training set in case of large data set.
sample examples to form exploration set
Enables to understand features and their relationship among themselves and with output labels.
In our case we have a small training data and we use it all for data exploration. There is no need to create a separate exploration set.
It's good idea to create a copy of the training set so that we can freely manipulate it without worrying about any manipulation in the original set.
exploration_set= strat_train_set.copy()
1. Scatter plot¶
sns.scatterplot(x='fixed acidity' , y='density' ,hue ='quality', data=exploration_set)
plt.show()
exploration_set.plot(kind='scatter', x='fixed acidity', y='density', alpha=0.5,c='quality',cmap=plt.get_cmap('jet'))
plt.show()
2. Standard correlation coefficient between features.¶
- Ranges between -1 to +1
- Correlation = +1 means Strong positive correlation between features
- Correlation = -1 means Strong negative correlation between features
- Correlation = 0 means No linear correlation between features
- Visualizaiton with heatmap only captures linear relationship between features
- For non-linear relationship, we use rank correlation
corr_matrix = exploration_set.corr()
Checking features that are correlated with the label,i.e quality in our case.
corr_matrix['quality'].sort_values(ascending=False)
quality 1.000000 alcohol 0.481197 sulphates 0.228050 citric acid 0.210802 fixed acidity 0.107940 residual sugar 0.003710 free sulfur dioxide -0.048291 pH -0.052063 chlorides -0.120231 density -0.193009 total sulfur dioxide -0.194511 volatile acidity -0.383249 Name: quality, dtype: float64
Notice that quality has strong positive correlation with alcohol content [0.48] and strong negative correlation with volitile acidity [-0.38]
Visualization of correlation matix using Heatmap :
plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, yticklabels=True,cbar=True,annot=True)
<AxesSubplot:>
We can notice:
- The correlation coefficient on diagonal is +1.
- Darker colors represent negative correlations, while fainer colors denote positive correlations. For example :
- citric acid and fixed acidity have strong positive correlation.
- pH and fixed acidity have strong negative correlation.
Another option to visualize the relationship between the feature is with scatter matrix.
from pandas.plotting import scatter_matrix
attribute_list = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides']
scatter_matrix(exploration_set[attribute_list], figsize=(12,8))
plt.show()
For convenience of visualization, we show it for a small number of attributes/features.
Similar analysis can be carried out with combined features-features that are derived from the original features.
Notes of wisdom
Visualization and data exploration do not have to be absolutely thorough.
Objective is to get quick insight into features and its relatioship with other features and labels.
Exploration is an iterative process: Once we build model and obtain more insights, we can come back to this step.
STEP 4 : Prepare data for ML algorithm¶
We often need to preprocess the data before using it for model building due to variety of reasons.
Due to errors in data capture, data may contain outliers or missing values
Different features may be at different scales.
The current data distribution is not exactly amenable to learning.
Typical steps in data preprocessing are as follows :
Separate features and labels.
Handling missing values and outliers
Feature scaling to bring all features on the same scale.
Applying certain transformations like log, square root on the features.
It is a good practive to make a copy of the data and apply preprocessing on that copy.
This ensures that in case something goes wrong, we will at least have original copy of the data intact.
1. Separation of features and labels¶
# Copy all features leaving aside the label.
wine_features = strat_train_set.drop('quality', axis=1)
wine_labels = strat_train_set['quality'].copy()
2. Data Cleaning¶
2.A. Handling missing values¶
First check if there are any missing values in feature set. One way to find that out is column-wise.
# counts the number of Nan in each column of wine_features
wine_features.isna().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 dtype: int64
In case, we have non-zero numbers in any columns, we have a problem of missing values
These values are missing due to errors in recording or they do not exist.
if they are not recorded:
use imputation technique to fill up the missing values
Drop the rows containig missing values
if they do exists, it is better to keep it as NaN.
Sklearn provides the following methods to drop rows conatining missing values:
1. dropna()
2. drop()
If provides SimpleImputer class for filling up missing values with say, median value.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(wine_features)
SimpleImputer(strategy='median')
In case, the features contains non-numeric attributes, they need to be dropped before calling the fit method on imputer object.
Let's check the statistics learnt by the imputer on the training set:
imputer.statistics_
array([ 7.9 , 0.52 , 0.26 , 2.2 , 0.08 , 14. ,
39. , 0.99675, 3.31 , 0.62 , 10.2 ])
Note that these are median values for each feature. We can cross-check it by calculating median on the feature set:
wine_features.median()
fixed acidity 7.90000 volatile acidity 0.52000 citric acid 0.26000 residual sugar 2.20000 chlorides 0.08000 free sulfur dioxide 14.00000 total sulfur dioxide 39.00000 density 0.99675 pH 3.31000 sulphates 0.62000 alcohol 10.20000 dtype: float64
Finally we use the trained imputer to transform the training set such that the missing values are replaced by the medians.
transf_features = imputer.transform(wine_features)
transf_features.shape
(1279, 11)
This returns a Numpy array and we can convert it to the dataframe if needed:
wine_features_transf = pd.DataFrame(transf_features, columns=wine_features.columns)
wine_features_transf.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.7 | 0.855 | 0.02 | 1.9 | 0.064 | 29.0 | 38.0 | 0.99472 | 3.30 | 0.56 | 10.75 |
| 1 | 6.9 | 0.630 | 0.33 | 6.7 | 0.235 | 66.0 | 115.0 | 0.99787 | 3.22 | 0.56 | 9.50 |
| 2 | 11.9 | 0.570 | 0.50 | 2.6 | 0.082 | 6.0 | 32.0 | 1.00060 | 3.12 | 0.78 | 10.70 |
| 3 | 8.6 | 0.470 | 0.27 | 2.3 | 0.055 | 14.0 | 28.0 | 0.99516 | 3.18 | 0.80 | 11.20 |
| 4 | 10.4 | 0.260 | 0.48 | 1.9 | 0.066 | 6.0 | 10.0 | 0.99724 | 3.33 | 0.87 | 10.90 |
2.B. Handling text and categorical attributes¶
ORDINAL ENCODING :
Converts categories to numbers
Call
fit_transform()method on ordinal_encoder object to convert text to numbers.The list of categories can be obtained via
categories_instance variable.
One issue with this representation is that the ML algorithm would assume that the two nearby values are closer than the distinct ones
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
ONE-HOT ENCODING :
Converts categorical variables to binary variables.
In other words, we create one binary feature per category - the feature value is 1 when the category is present, else it is 0.
One feature is 1 (hot) and the rest are 0 (cold).
The new features are referred to as dummy features. Scikt-Learn provides a
OneHotEncoderclass to convert categorical values into one-hot vectors.
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
We need to call
fit_transform()method on OneHotEncoder object.The output is a SciPy sparse matrix rather than NumPy array. This enables us to save space when we have a huge nuber of categories.
In case we want to convert it to dense reprersentation, we can do with
toarray()method.The list of categories can be obtained via
categories_instance variableAs we observed that when the number of categories are very large, the one-hot encoding would result in a very large number of features.
This can be addressed with one of the following approaches:
Replace with categorical numberical features
Convert into low-dimensional learnable vectors called
embeddings
3.Feature Scaling¶
Most ML algorithms do not perform well when input features are on very different scales.
Scaling of target label is generally not required.
3.A. Min-Max Scaling or Normalization¶
Scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1.
We subtract minimum value of a feature from the current value and divide it by the difference between minimum and the maximum value of that feature.
Scikit-Lean provides
MinMaxScalartransformer for this.One can specify hyperparameter
feature_rangeto specify the range of the feature.
3.B. Standardization¶
Scaling technique where the values are centered around the mean with a unit standard deviation.
We subtract mean value of each featurer from the current value and divide it by the standard deviation so that the resulting feature has a unit variance.
While
normalizationbounds values between 0 and 1,standardizationdoes not bound values to a specific range.Standardization is less affected by the outliers compared to the normalization.
Scikit-Learn provides
StandardScalartransformation for features standardization.Note that all these transformers are learnt on the traning data and then applied on the training and test data to transform them.
Never learn these transformers on the full dataset
Transformation Pipeline¶
Scikit-Learn provides a Pipeline class to line up transformations in an intended order.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
estimators = [('SimpleImputer', SimpleImputer()), ('StandardScaler', StandardScaler())]
pipe = Pipeline(steps=estimators)
transform_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='median')),
('standardscaler', StandardScaler())])
wine_features_tr = transform_pipeline.fit_transform(wine_features)
Let's understand what is happening here:
Pipelinehas a sequence of transformations-missing value imputation followed by standardization.Each step is the sequence is define by name,estimator pair.
Each name should be unique and should not contain __ (double underscore)
The output of one step is passed on the next one in sequence until it reaches the last step.
Here the pipeline first performs imputation of missing values and its result is passed for standardization.
The pipeline exposes the same method as the final estimator.
- Here StandardScalar is the last estimator and since it is a transformer, we call
fit_transform()method on thePiplelineobject.
- Here StandardScalar is the last estimator and since it is a transformer, we call
Transforming Mixed Features¶
The real world data has both categorical as well as numerical features and we need to apply different transformations to them.
Scikit-Learn introduced
ColumnTransformerclass to handle this.
from sklearn.compose import ColumnTransformer
The
ColumnTransformerapplies each transformation to the appropriate columns and then concatenates the outputs along the columns.Note that all transformers must return the same number of rows.
The numeric transformers return dense matrix while the categorical ones return sparse matrix.
The ColumnTransformer automatically determines the type of the output based on the density of resulting matrix.
STEP 5 : Select and Train ML model¶
It is a good practice to build a quick baseline model on the preprocessed data and get an idea about model performance.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(wine_features_tr, wine_labels)
LinearRegression()
Now that we have a working model of a regression, let's evaluate performance of the model on training as well as test sets.
For regression models, we use mean squared error as an evaluation measure.
from sklearn.metrics import mean_squared_error
quality_pred = lin_reg.predict(wine_features_tr)
mean_squared_error(wine_labels, quality_pred)
0.4206571060060277
Let's evaluate performance on the test set.
We need to first apply transformation on the test set and then apply the model prediction function.
# copy all features leaving aside the label.
wine_features_test = strat_test_set.drop("quality", axis=1)
#copy the label list
wine_labels_test = strat_test_set['quality'].copy()
#apply transformations
wine_features_test_tr = transform_pipeline.fit_transform(wine_features_test)
#call predict function and calculate MSE.
quality_test_pred = lin_reg.predict(wine_features_test_tr)
mean_squared_error(wine_labels_test, quality_test_pred)
0.3975913087501518
Let's visualize the error between the actual and predicted values.
plt.scatter(wine_labels_test, quality_test_pred)
plt.plot(wine_labels_test, wine_labels_test, 'b-')
plt.xlabel('Actual quality')
plt.ylabel('Predicted quality')
plt.show()
Let's try another model: DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(wine_features_tr, wine_labels)
DecisionTreeRegressor()
quality_pred = tree_reg.predict(wine_features_tr)
print('Training Error :', mean_squared_error(wine_labels, quality_pred))
quality_test_pred = tree_reg.predict(wine_features_test_tr)
print('Test Error : ',mean_squared_error(wine_labels_test, quality_test_pred))
Training Error : 0.0 Test Error : 0.58125
Note that the training error is 0, while the test error is 0.58. This is an example of an overfitted model.
plt.scatter(wine_labels_test, quality_test_pred)
plt.plot(wine_labels_test, wine_labels_test, 'r-')
plt.xlabel('Actual quality')
plt.ylabel('Predicted quality')
plt.show()
Cross-Validation (CV)¶
Cross validation provides a separate MSE for each validation set, which we can use to get a mean estimation of MSE as well as the standard deviation, which helps us to determine how precise is the estimate.
The additional cost we pay in cross validation is additional training runs, which may be too expensive in certain cases.
from sklearn.model_selection import cross_val_score
def display_scores(scores):
print("Scores : \n", scores)
print()
print("Mean : ", scores.mean())
print("Standard deviation : ", scores.std())
Linear Regression CV
scores = cross_val_score(lin_reg, wine_features_tr,
wine_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_mse_scores = -scores
display_scores(lin_reg_mse_scores)
Scores : [0.56364537 0.4429824 0.38302744 0.40166681 0.29687635 0.37322622 0.33184855 0.50182048 0.51661311 0.50468542] Mean : 0.431639217212196 Standard deviation : 0.08356359730413969
DecisionTreeRegressor CV
scores = cross_val_score(tree_reg, wine_features_tr,
wine_labels, scoring="neg_mean_squared_error", cv=10)
tree_mse_scores = -scores
display_scores(tree_mse_scores)
Scores : [0.6171875 0.6875 0.6328125 0.5078125 0.4609375 0.640625 0.65625 0.7109375 0.859375 1.07874016] Mean : 0.6852177657480315 Standard deviation : 0.16668343331737054
Upon comparision of scores of the two models, we can see that the LinearRegressor has better MSE and more precise estimation compared to DecisionTree.
RandomForest CV
Random forest model builds multiple decision trees on randomly selected features and then average their predictions.
Building a model on top of other model is called * ensemble learning * Which is often used to improve performance of ML models.
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
rf_reg.fit(wine_features_tr, wine_labels)
scores = cross_val_score(rf_reg, wine_features_tr,
wine_labels, scoring="neg_mean_squared_error", cv=10)
rf_mse_scores = -scores
display_scores(rf_mse_scores)
Scores : [0.36989922 0.41363672 0.29063438 0.31722344 0.21798125 0.30233828 0.27124922 0.38747344 0.42379219 0.46229449] Mean : 0.34565226131889765 Standard deviation : 0.0736322184302973
quality_pred = rf_reg.predict(wine_features_tr)
print('Training Error :', mean_squared_error(wine_labels, quality_pred))
quality_test_pred = rf_reg.predict(wine_features_test_tr)
print('Test Error : ',mean_squared_error(wine_labels_test, quality_test_pred))
Training Error : 0.04695613760750586 Test Error : 0.34449875
Random forest looks more promising than the other two models.
It's good practice to build a few such models quickly without tuning their hyperparameters and shortlist a few promising models among them.
Also save the methods to the disk in Python
pickleformat.
STEP 6 : FineTune the model¶
Usually there are a number of hyperparameters in the model, which are set manually.
Tuning these hyperparameters lead to better accuracy of ML models.
Finding the best combination of hyperparameters is a search problem in the space of hyperparameters, which is huge.
Grid Search¶
from sklearn.model_selection import GridSearchCV
We need to specify a list of hyperparameters along with the range of values to try.
It automatically evaluates all possible combinations of hyperparameter values using cross-validation.
For example, there are number of hyperparameters in RandomForest regression such as:
Number of estimators
Maximum number of features
param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
Here the parameter grid contains two combinations:
The first combination contains n_estimators with 3 values and max_features with 4 values.
The second combination has an additional bootstrap parameter, which is set to False. Note that it was set to its default value, which is True, in the first grid.
Let's compute the total combinations evaluated here:
The first one results in 3 × 4 = 12 combinations.
The second one has 2 values of n_estimators and 3 values of max_features, thus resulting 2 × 3 = 6 in total of values.
The total number of combinations evaluated by the parameter grid 12 + 6 = 18.
Let's create an object of GridSearchCV:
grid_search = GridSearchCV(rf_reg, param_grid, cv=5,
scoring='neg_mean_squared_error', return_train_score=True)
In this case, we set cv=5 i.e. using 5 fold cross validation for training the model.
We need to train the model for 18 parameter combinations and each combination would be trained 5 times as we are using cross-validation here.
The total model training runs = 18 × 5 = 90
grid_search.fit(wine_features_tr, wine_labels)
GridSearchCV(cv=5, estimator=RandomForestRegressor(),
param_grid=[{'max_features': [2, 4, 6, 8],
'n_estimators': [3, 10, 30]},
{'bootstrap': [False], 'max_features': [2, 3, 4],
'n_estimators': [3, 10]}],
return_train_score=True, scoring='neg_mean_squared_error')
The best parameter combination can be obtained as follows:
grid_search.best_params_
{'max_features': 6, 'n_estimators': 30}
Let's find out the error at different parameter settings:
cv_res = grid_search.cv_results_
for mean_score, params in zip(cv_res["mean_test_score"], cv_res["params"]):
print(-mean_score, params)
0.5096674155773421 {'max_features': 2, 'n_estimators': 3}
0.38494794730392157 {'max_features': 2, 'n_estimators': 10}
0.35890284926470584 {'max_features': 2, 'n_estimators': 30}
0.4765907543572984 {'max_features': 4, 'n_estimators': 3}
0.37949047181372547 {'max_features': 4, 'n_estimators': 10}
0.3677285709422658 {'max_features': 4, 'n_estimators': 30}
0.47674223856209147 {'max_features': 6, 'n_estimators': 3}
0.39086173406862745 {'max_features': 6, 'n_estimators': 10}
0.35285364923747276 {'max_features': 6, 'n_estimators': 30}
0.47786049836601296 {'max_features': 8, 'n_estimators': 3}
0.37944690563725486 {'max_features': 8, 'n_estimators': 10}
0.35524742306644874 {'max_features': 8, 'n_estimators': 30}
0.4390253948801742 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.3897452818627451 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.4490985838779956 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.3858988664215686 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
0.45253914760348585 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
0.3858853860294117 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
As you can notice the lowest MSE is obtained for the best parameter combination. Let's obtain the best estimator as follows :
grid_search.best_estimator_
RandomForestRegressor(max_features=6, n_estimators=30)
NOTE : GridSearchCV is initialized with refit=True option, which retrians the best estimator on the full training set. This is likely to lead us to a better model as it is trained on a larger dataset.
Randomized Search¶
When we have a large hyperparameter space, it is desirable to try RandomizedSearchCV.
RandomizedSearchCV is a wrapper around the RandomizedSearchCV class in the scikit-learn library.
It selects a random value for each hyperparameter at the start of each iteration and repeats the process for the given number of random combinations.
It enables us to search hyperparameter space with appropriate budget control.
from sklearn.model_selection import RandomizedSearchCV
Analysis of best model and its errors¶
Analysis of the model provides useful insights about features. Let's obtain the feature importance as learnt by the model.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([0.05388861, 0.1266557 , 0.05822554, 0.05641645, 0.05791189,
0.04626635, 0.08045273, 0.07275072, 0.05712442, 0.14163643,
0.24867117])
sorted(zip(feature_importances ,features) ,reverse=True)
[(0.24867116536102707, 'alcohol'), (0.1416364273940636, 'sulphates'), (0.1266556963936701, 'volatile acidity'), (0.0804527251831923, 'total sulfur dioxide'), (0.07275072016325314, 'density'), (0.058225542967296186, 'citric acid'), (0.05791188978825247, 'chlorides'), (0.05712441669365611, 'pH'), (0.05641645467144794, 'residual sugar'), (0.05388861091468476, 'fixed acidity'), (0.046266350469456416, 'free sulfur dioxide')]
Based on this information, we may drop features that are not so important.
It is also useful to analyze the errors in prediction and understand its causes and fix them.
Evaluation on test set¶
Now that we have a reasonable model, we evaluate its performance on the test set. The following steps are involved in the process :
- Transform the best features
# copy all features leaving aside the label
wine_features_test = strat_test_set.drop('quality', axis=1)
# copy the label list
wine_labels_test = strat_test_set['quality'].copy()
# apply transformations
wine_features_test_tr = transform_pipeline.fit_transform(wine_features_test)
- Use the predict method with the trained model and the test set.
quality_test_pred = grid_search.best_estimator_.predict(wine_features_test_tr)
- Compare the predicted labels with the actual ones and report the evaluation metrics.
print('Test Error : ',mean_squared_error(wine_labels_test, quality_test_pred))
Test Error : 0.35345138888888883
- It's a good idea to get 95% confidence interval of the evaluation metric. It can be obtained by the following code:
from scipy import stats
confidence = 0.95
squared_errors = (quality_test_pred - wine_labels_test) **2
stats.t.interval (confidence, len(squared_errors)-1, loc=squared_errors.mean(),scale=stats.sem(squared_errors))
(0.2915927656958191, 0.41531001208195856)
STEP 7 : Present your Solution¶
Once we have satisfactory model based on its performance on the test set, we reach the prelaunch phase.
Before launch :
We need to present our solution that highlights learnings, assumptions and systems limitation.
Document everything, create clear visualizations and present the model.
In case, the model doesn't work better that the experts, it may still be a good idea to launch it and free up bandwidths of human experts.
STEP 8: Launch, Monitor and Maintain your system¶
Launch :
Plug in input sources &
Write test cases
Monitoring :
System outages
Degradation of model performance
Sampling predictions for human evaluation
Regular assessment of data quality, which is critical for model performance.
Maintenance :
Train model regularly every fixed interval with fresh data.
Production roll out of the model.
SUMMARY¶
In this module, we studied steps involved in end-to-end machine learning project with an example of prediction of wine quality.
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Data Cleaning¶
1. Data Imputation¶
Many machine learning algorithms need full feature matrix and they may not work in presence of missing data.
Data imputation identifies missing values in each feature of the dataset and replaces them with an appropriate values based on a fixed strategy such as :
mean or median or mode of that feature.
use specified constant value. Sklearn library provides
sklearn.impute.SimpleImputerclass for this purpose.
from sklearn.impute import SimpleImputer
Some of its important parameters:
missing_values: could be
int,str,np.nanorNone. Default isnp.nan.strategy: string, default is 'mean'. One the following strategies can be used:
mean- missing values are replaced using the mean along each column.median-missing values are replaced using the median along each column.most_frequent-missing values are replaced using the most_frequent along each column.constant- missing values are replaced with value specified infill_valueargument.add_indicator- a boolean parameter that when set toTruereturns missing value indicators inindicator_member value.
NOTE :
meanandmedianstrategies can only be used with numeric data.most_frequentandconstantstrategies can be used with strings or numberic data.
Data imputation on real world dataset.¶
Let's perform data imputation on real world dataset. We will be using https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/cleveland.data for this purpose. We will load this dataset from csv file.
cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
heart_data = pd.read_csv(
'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', header=None, names=cols)
heart_data.head()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63.0 | 1.0 | 1.0 | 145.0 | 233.0 | 1.0 | 2.0 | 150.0 | 0.0 | 2.3 | 3.0 | 0.0 | 6.0 | 0 |
| 1 | 67.0 | 1.0 | 4.0 | 160.0 | 286.0 | 0.0 | 2.0 | 108.0 | 1.0 | 1.5 | 2.0 | 3.0 | 3.0 | 2 |
| 2 | 67.0 | 1.0 | 4.0 | 120.0 | 229.0 | 0.0 | 2.0 | 129.0 | 1.0 | 2.6 | 2.0 | 2.0 | 7.0 | 1 |
| 3 | 37.0 | 1.0 | 3.0 | 130.0 | 250.0 | 0.0 | 0.0 | 187.0 | 0.0 | 3.5 | 3.0 | 0.0 | 3.0 | 0 |
| 4 | 41.0 | 0.0 | 2.0 | 130.0 | 204.0 | 0.0 | 2.0 | 172.0 | 0.0 | 1.4 | 1.0 | 0.0 | 3.0 | 0 |
The dataset has the following features :
Age (in years)
Sex (1 = male; 0 = female)
cp - chest pain type
trestbps - resting blood pressure (anything above 130-140 is typically cause for concern)
fbs - fasting blood sugar (>120 mg/dl) (1 = true; 0 = false)
restecg - resting electrocardiographic results
normal=0
1 = having ST-T wave abnormality;
2 = showing probable or definite left ventricular hypertropy by Estes' criteria
thalch - maximum heart rate achieved
exang - excercise induced angina
1 = yes
0 = no
oldpeak - depression induced by excercise relative to rest
slope - slope of the peak excercise ST segment
1 = unsloping;
2 = flat value;
3 = downsloping
ca - number of major vessels (0-3) colored by fluroscopy
thal - (3 = normal; 6 =fixed defect; 7 = reversable defect)
num - diagnosis of heart disease (angiographic disease status)
0 < 50% diameter narrowing;
1: . 50% diameter narrowing
STEP 1 : Check if dataset has missing values¶
This can be checked via dataset description or by check number of
nanornp.nullin the dataframe. However such check can be performed only for numerical features.For non-numberical features, we can list their unique values and check if there are values like
?.
heart_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 303 entries, 0 to 302 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 303 non-null float64 1 sex 303 non-null float64 2 cp 303 non-null float64 3 trestbps 303 non-null float64 4 chol 303 non-null float64 5 fbs 303 non-null float64 6 restecg 303 non-null float64 7 thalach 303 non-null float64 8 exang 303 non-null float64 9 oldpeak 303 non-null float64 10 slope 303 non-null float64 11 ca 303 non-null object 12 thal 303 non-null object 13 num 303 non-null int64 dtypes: float64(11), int64(1), object(2) memory usage: 33.3+ KB
Let's check if there are any missing values in numerical columns-here we have checked it for all columns in the dataframe.
heart_data.isnull().sum()
age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalach 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 num 0 dtype: int64
There are two non-numerical features : ca and thal so list their unique values:
print('Unique values in ca:', heart_data.ca.unique())
print('Unique values in thal:', heart_data.thal.unique())
Unique values in ca: ['0.0' '3.0' '2.0' '1.0' '?'] Unique values in thal: ['6.0' '3.0' '7.0' '?']
Both of them contain ?, which is a missing value. Let's count the number of missing values.
print('Number of missing vlaue in ca:',
heart_data.loc[heart_data.ca == '?', 'ca'].count())
print('Number of missing vlaue in thal:',
heart_data.loc[heart_data.thal == '?', 'thal'].count())
Number of missing vlaue in ca: 4 Number of missing vlaue in thal: 2
STEP 2 : Replace ? with NaN¶
heart_data.replace('?' ,np.nan ,inplace=True)
STEP 3 : Fill the mising values with sklearn missing value imputation utilities.¶
Here we use SimpleImputer with mean strategy. We will try two variations :
a. add_indicator=False : Default choice that only imputes missing values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(heart_data)
heart_data_imputed = imputer.transform(heart_data)
print(heart_data.shape)
print(heart_data_imputed.shape)
(303, 14) (303, 14)
b. add_indicator=True : Adds additional column for each column containing missing values.
In our case, this adds two columns one for ca and other for thal. It indicates if the sample has a missing value.
Now the number of extra column added will be 1 per missing columns that contains the boolean value i.e True/False to indicate that earlier some values were missing. It is just like a pointer for missing value update.
imputer = SimpleImputer(missing_values=np.nan,
strategy='mean', add_indicator=True)
imputer = imputer.fit(heart_data)
heart_data_imputed_with_indicator = imputer.transform(heart_data)
print(heart_data.shape)
print(heart_data_imputed_with_indicator.shape)
(303, 14) (303, 16)
2.Feature Scaling¶
Feature Scaling transform feature values such that all the features are on the same scale.
When we use feature matrix with all features on the same scale, it provides us certain advantages as listed below:
Enables Faster Convergence in iterative optimization algorithms like gradien descent and its variants.
The performance of ML algorithms such as SVM, K-NN and K-means etc. that compute euclidean distance among input samples gets impacted if the features are not scaled.
Tree based Ml algorithms are not affected by feature-scaling. In other words, feature scaling is not required for tree based ML algorithms.
Feature scaling can be performed with the following methods:
Standardization
Normalization
MaxAbsScaler
Feature Scaling on real world dataset.¶
Let's demonstrate feature scaling on a real world dataset. For this purpose we will be using https://archive.ics.uci.edu/ml/datasets/Abalone .
We will use different scaling utilities in the sklearn library.
cols = ['sex', 'Length', 'Diameter', 'Height','Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',header=None, names=cols)
Abalone dataset has the following features :
Sex -nominal (M, F, and I (infant))
Length (mm - Longest shell measurement)
Diameter (mm - perpendicular to lenght)
Height (mm - with meat in shell)
Whole weight (grams -whole abalone)
Shucked weight (grams - whole abalone)
Viscera weight (grams - gut weight (after bleeding))
Shell weight (grams - after being dried)
Rings (target - age in years)
STEP 1 : Examine the dataset¶
abalone_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4177 entries, 0 to 4176 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 4177 non-null object 1 Length 4177 non-null float64 2 Diameter 4177 non-null float64 3 Height 4177 non-null float64 4 Whole weight 4177 non-null float64 5 Shucked weight 4177 non-null float64 6 Viscera weight 4177 non-null float64 7 Shell weight 4177 non-null float64 8 Rings 4177 non-null int64 dtypes: float64(7), int64(1), object(1) memory usage: 293.8+ KB
STEP 1 [Optional] : Convert non-numerical attributes to numerical ones.¶
In this dataset, sex is a non-numeric column in this dataset. Let's examine it and see if we can convert it to numeric representation.
abalone_data.sex.unique()
array(['M', 'F', 'I'], dtype=object)
#Assign numerical values to sex.
abalone_data = abalone_data.replace({"sex": {'M': 1, 'F': 2, 'I': 3}})
abalone_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4177 entries, 0 to 4176 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 4177 non-null int64 1 Length 4177 non-null float64 2 Diameter 4177 non-null float64 3 Height 4177 non-null float64 4 Whole weight 4177 non-null float64 5 Shucked weight 4177 non-null float64 6 Viscera weight 4177 non-null float64 7 Shell weight 4177 non-null float64 8 Rings 4177 non-null int64 dtypes: float64(7), int64(2) memory usage: 293.8 KB
STEP 2 : Separate labels from features.¶
y = abalone_data.pop('Rings')
print('The DataFrame object after deleting the column : \n')
abalone_data.info()
The DataFrame object after deleting the column : <class 'pandas.core.frame.DataFrame'> RangeIndex: 4177 entries, 0 to 4176 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 4177 non-null int64 1 Length 4177 non-null float64 2 Diameter 4177 non-null float64 3 Height 4177 non-null float64 4 Whole weight 4177 non-null float64 5 Shucked weight 4177 non-null float64 6 Viscera weight 4177 non-null float64 7 Shell weight 4177 non-null float64 dtypes: float64(7), int64(1) memory usage: 261.2 KB
STEP 3 : Examine feature scales¶
3A. Statistical method¶
Check the scales of different feature with describe method of dataframe.
abalone_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| sex | 4177.0 | 1.955470 | 0.827815 | 1.0000 | 1.0000 | 2.0000 | 3.000 | 3.0000 |
| Length | 4177.0 | 0.523992 | 0.120093 | 0.0750 | 0.4500 | 0.5450 | 0.615 | 0.8150 |
| Diameter | 4177.0 | 0.407881 | 0.099240 | 0.0550 | 0.3500 | 0.4250 | 0.480 | 0.6500 |
| Height | 4177.0 | 0.139516 | 0.041827 | 0.0000 | 0.1150 | 0.1400 | 0.165 | 1.1300 |
| Whole weight | 4177.0 | 0.828742 | 0.490389 | 0.0020 | 0.4415 | 0.7995 | 1.153 | 2.8255 |
| Shucked weight | 4177.0 | 0.359367 | 0.221963 | 0.0010 | 0.1860 | 0.3360 | 0.502 | 1.4880 |
| Viscera weight | 4177.0 | 0.180594 | 0.109614 | 0.0005 | 0.0935 | 0.1710 | 0.253 | 0.7600 |
| Shell weight | 4177.0 | 0.238831 | 0.139203 | 0.0015 | 0.1300 | 0.2340 | 0.329 | 1.0050 |
Note :
- There are 4177 examples or rows in this dataset.
- The mean and standard deviation of features are quite different from one another.
3B. Visualization of feature distributions¶
This method includes :
Histogram
Kernel density estimation (KDE) plot
Boxplot
Violin plot
- Feature Histogram
We will have separate and combined histogram plots to check if the feature are indeed on different scales.
plt.hist(np.array(abalone_data['Length']))
plt.show()
plt.hist(np.array(abalone_data['Shucked weight']))
plt.show()
for i in abalone_data.columns:
plt.hist(np.array(abalone_data[i]))
Observe that the features have different distributions and scales.
- KDE plot
Alternatively, we can generate Kernel Density Estimate plot using Gaussian Kernels.
In statistics, kernel density function (KDE) is a non-parametric way to estimate the probability density function (PDF) of a random variable. This function uses Gaussian Kernels and includes automatic bandwidth determination.
ax = abalone_data.plot.kde()
Observe that the features have different distributions and scales.
- Boxplot
A box plot (or box-and-whisker plot) shows the distribution of quantitative in a way that facilitates comparisions between variables or across levels of a categorical variables.
The box shows the quartiles of the dataset while the whiskers extend to show the rest of the distribution, except for points that are determined to be 'outliers' using a method that is a function of the inter-quartile range.
ax = sns.boxplot(data=abalone_data, orient='h', palette='Set2')
STEP 4 : Scaling the features¶
4A. Normalization¶
The features are normalized such that their range lies between $[0,1] or [-1,1]$. There are two ways to achieve this :
MaxAbsScalertransform features in range $[-1,1]$MinMaxScalertransforms features in range $[0,1]$
a. MaxAbsScaler
It transforms the original features vector $ \textbf x$ into new feature vector $\textbf x^{'} $ so that all values fall within range [-1,1] and the range of each feature is the same.
\begin{equation} \textbf x^{'} = \frac{\textbf x}{\text {MaxAbsoluteValue}} \end{equation}
where :
\begin{equation} \text {MaxAbsolutevalue}= \text {max}(\textbf x.max,|\textbf x.min|) \end{equation}
x = np.array([4, 2, 5, -2, -100]).reshape(-1, 1)
print(x)
[[ 4] [ 2] [ 5] [ -2] [-100]]
from sklearn.preprocessing import MaxAbsScaler
max_abs_scaler = MaxAbsScaler()
x_mas = max_abs_scaler.fit_transform(x)
print(x_mas)
[[ 0.04] [ 0.02] [ 0.05] [-0.02] [-1. ]]
b. MinMaxScaler
Normalization is a procedure in which the feature values are scaled such that they range between 0 and 1. This technique is also called Min-Max Scaling.
It is performed with the following formula: \begin{equation} \mathbf X_{new} = \frac{X_{old} - X_{min} }{\mathbf X_{max} - X_{min}} \end{equation}
where :
$X_{old}$ is the old value of a data point, which is rescaled to $ X_{new}$.
$X_{min}$ is minimum value of feature $X$
$X_{max}$, is maximum value of feature $X$.
Normalization can be achieved by MinMaxScaler from sklearn library.
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(abalone_data)
X_normalized[:5]
array([[0. , 0.51351351, 0.5210084 , 0.0840708 , 0.18133522,
0.15030262, 0.1323239 , 0.14798206],
[0. , 0.37162162, 0.35294118, 0.07964602, 0.07915707,
0.06624075, 0.06319947, 0.06826109],
[0.5 , 0.61486486, 0.61344538, 0.11946903, 0.23906499,
0.17182246, 0.18564845, 0.2077728 ],
[0. , 0.49324324, 0.5210084 , 0.11061947, 0.18204356,
0.14425017, 0.14944042, 0.15296462],
[1. , 0.34459459, 0.33613445, 0.07079646, 0.07189658,
0.0595158 , 0.05134957, 0.0533134 ]])
Let's look at the mean and standard deviation (SD) of each feature:
X_normalized.mean(axis=0)
array([0.47773522, 0.60674608, 0.59307774, 0.12346584, 0.29280756,
0.24100033, 0.23712127, 0.2365031 ])
X_normalized.std(axis=0)
array([0.4138578 , 0.16226829, 0.16676972, 0.03701066, 0.17366046,
0.14925109, 0.14430695, 0.13870055])
The means of SDs of different features are now comparable. We can confirm this again through visualization as before:
cols = ['sex', 'Length', 'Diameter', 'Height', 'Whole weight',
'Shucked weight', 'Viscera weight', 'Shell weight']
X_normalized = pd.DataFrame(X_normalized, columns=cols)
sns.histplot(data=X_normalized)
<AxesSubplot:ylabel='Count'>
sns.kdeplot(data=X_normalized)
<AxesSubplot:ylabel='Density'>
4B. Standardization¶
Standardization is another feature scaling technique that results into (close to ) zero mean and unit standard deviation of a feature's values.
Formula for standardization: \begin{equation} X_{new} = \frac{X_{old}-\mu}{\sigma} \end{equation}
where, $\mu$ and $\sigma$ respectively are the mean and standard deviation of the feature values.
- Standardization can be achieved by
StandardScalerfrom sklearn library.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_standardized = ss.fit_transform(abalone_data)
X_standardized[:5]
array([[-1.15434629, -0.57455813, -0.43214879, -1.06442415, -0.64189823,
-0.60768536, -0.72621157, -0.63821689],
[-1.15434629, -1.44898585, -1.439929 , -1.18397831, -1.23027711,
-1.17090984, -1.20522124, -1.21298732],
[ 0.05379815, 0.05003309, 0.12213032, -0.10799087, -0.30946926,
-0.4634999 , -0.35668983, -0.20713907],
[-1.15434629, -0.69947638, -0.43214879, -0.34709919, -0.63781934,
-0.64823753, -0.60759966, -0.60229374],
[ 1.26194258, -1.61554351, -1.54070702, -1.42308663, -1.27208566,
-1.2159678 , -1.28733718, -1.32075677]])
X_standardized.mean(axis=0)
array([-1.19075871e-17, -5.83471770e-16, -3.02792930e-16, 3.91249292e-16,
9.18585294e-17, -1.02065033e-17, 2.70472337e-16, 2.97689679e-16])
X_standardized.std(axis=0)
array([1., 1., 1., 1., 1., 1., 1., 1.])
The means of different features are now comparable with SD = 1
# sns.histplot(data=X_standardized)
in_cols = cols[:len(cols)-1]
plt.figure(figsize=(12, 8))
data = pd.DataFrame(X_standardized, columns=cols)
for colname in abalone_data:
plt.hist(data[colname].values, alpha=0.4)
plt.legend(in_cols, fontsize=18, loc='upper right', frameon=True)
plt.title('Distribution of features across samples after standardization')
plt.xlabel('Range', fontsize=16)
plt.ylabel('Frequency', fontsize=16)
plt.show()
data.plot.kde()
plt.show()
ax = sns.boxplot(data=data, orient='h', palette='Set2')
ax = sns.violinplot(data=data, orient='h', palette='Set2')
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Feature Transformations¶
1.Polynomial Features¶
Generates a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree.
For example, if an input sample is two dimensional and of the form $[a,b]$ , the degree-2 polynomial features are $[1,a,a^2,b,b^2 ,ab]$ .
sklearn.preprocessing.PolynomialFeaturesenables us to perform polynomial transformation of desired degree.
Let's demonstrate it with wine quality dataset :
from sklearn.preprocessing import PolynomialFeatures
wine_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',sep=';')
wine_data_copy = wine_data.copy()
wine_data = wine_data.drop(['quality'] ,axis=1)
print('Number of features before transformation = ', wine_data.shape)
Number of features before transformation = (1599, 11)
poly = PolynomialFeatures(degree=2)
wine_data_poly = poly.fit_transform(wine_data)
print('Number of features after transformation = ', wine_data_poly.shape)
Number of features after transformation = (1599, 78)
Note that after transformation, we have 78 features. Let's list out these features:
poly.get_feature_names_out()
array(['1', 'fixed acidity', 'volatile acidity', 'citric acid',
'residual sugar', 'chlorides', 'free sulfur dioxide',
'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
'fixed acidity^2', 'fixed acidity volatile acidity',
'fixed acidity citric acid', 'fixed acidity residual sugar',
'fixed acidity chlorides', 'fixed acidity free sulfur dioxide',
'fixed acidity total sulfur dioxide', 'fixed acidity density',
'fixed acidity pH', 'fixed acidity sulphates',
'fixed acidity alcohol', 'volatile acidity^2',
'volatile acidity citric acid', 'volatile acidity residual sugar',
'volatile acidity chlorides',
'volatile acidity free sulfur dioxide',
'volatile acidity total sulfur dioxide',
'volatile acidity density', 'volatile acidity pH',
'volatile acidity sulphates', 'volatile acidity alcohol',
'citric acid^2', 'citric acid residual sugar',
'citric acid chlorides', 'citric acid free sulfur dioxide',
'citric acid total sulfur dioxide', 'citric acid density',
'citric acid pH', 'citric acid sulphates', 'citric acid alcohol',
'residual sugar^2', 'residual sugar chlorides',
'residual sugar free sulfur dioxide',
'residual sugar total sulfur dioxide', 'residual sugar density',
'residual sugar pH', 'residual sugar sulphates',
'residual sugar alcohol', 'chlorides^2',
'chlorides free sulfur dioxide', 'chlorides total sulfur dioxide',
'chlorides density', 'chlorides pH', 'chlorides sulphates',
'chlorides alcohol', 'free sulfur dioxide^2',
'free sulfur dioxide total sulfur dioxide',
'free sulfur dioxide density', 'free sulfur dioxide pH',
'free sulfur dioxide sulphates', 'free sulfur dioxide alcohol',
'total sulfur dioxide^2', 'total sulfur dioxide density',
'total sulfur dioxide pH', 'total sulfur dioxide sulphates',
'total sulfur dioxide alcohol', 'density^2', 'density pH',
'density sulphates', 'density alcohol', 'pH^2', 'pH sulphates',
'pH alcohol', 'sulphates^2', 'sulphates alcohol', 'alcohol^2'],
dtype=object)
Observe that :
Some features have ^2 suffix - these are degree-2 features of input features. For example,
sulphates^2is the square ofsulphatesfeatures.Some features are combination of names of the original feature names. For example,
total sulfur dioxide pHis a combinationn of two featurestotal sulfur dioxideandpH.
2.Discretization¶
Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values.
Certain datasets with continuous features may benefit from discretization, because it can transform the datasets of continuous attributes to one with only nominal attributes.
One-hot encoded discretized features can make a model more expressive, while maintaining interpretability.
For instance, pre-processing with discretizer can introduce non-linearity to linear models.
KBinsDiscretizer discretizes features into k-bins.
from sklearn.preprocessing import KBinsDiscretizer
wine_data = wine_data_copy.copy()
#transform the dataset with KBinDiscretizer
kbd = KBinsDiscretizer(n_bins=10, encode='onehot')
X = np.array(wine_data['chlorides']).reshape(-1, 1)
X_binned = kbd.fit_transform(X)
X_binned
<1599x10 sparse matrix of type '<class 'numpy.float64'>' with 1599 stored elements in Compressed Sparse Row format>
X_binned.toarray()[:5]
array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])
3.Handling Categorical Features¶
We need to convert the categorical features into numeric features. It includes :
Ordinal encoding
One hot encoding
Label encoding
MultiLabel Binarizer
Using dummy variables
Iris dataset has the following features:
sepal length (in cm)
sepal width (in cm)
petal length (in cm)
petal width (in cm)
class : Iris Setosa, Iris Versicolour, Iris Virginica
cols = ['sepal length', 'sepal width', 'petal width', 'label']
iris_data = pd.read_csv(
'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None, names=cols)
iris_data.head()
| sepal length | sepal width | petal width | label | |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
1. Ordinal Encoding
Categorical features are those that contain categories or groups such as education level, state etc as their data.
These are non-numerical features and need to be converted into appropriate from before they feeding them for training an ML model.
Our intuitive way of handling them could be to assign them a numerical value.
As an example, take state as a feature with 'Punjab', Rajasthan, and Haryana as the possible values. We might consider assigning number to these values as follows:
Old feature | New feature ------------|------------- Punjab | 1 Rajasthan | 2 Haryana | 3
However, this approach assigns some ordering to the labels, i.e. states, thus representing that Haryana is thrice Punjab and Rajasthan is twice Pubjab, these relationships do not exist in the data, thus providing wrong information to the ML model.
Let's demonstrate this concept with Iris dataset.
from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder()
iris_labels = np.array(iris_data['label'])
iris_labels_transformed = ordinal.fit_transform(iris_labels.reshape(-1, 1))
print(np.unique(iris_labels_transformed))
print()
print('First 5 labels in ordinal encoded form are : \n',
iris_labels_transformed[:5])
[0. 1. 2.] First 5 labels in ordinal encoded form are : [[0.] [0.] [0.] [0.] [0.]]
2. One-hot Encoding
- This approach consists of creating an addtional feature for each label present in categorical feature(i.e. the number of different states here) and putting a 1 or 0 for these new features depending on the categorical feature's value. That is,
| Old feature | New feature_1 (punjab) | New feature_2 (Rajasthan) | New feature_3(Haryana) |
|---|---|---|---|
| Punjab | 1 | 0 | 0 |
| Rajasthan | 0 | 1 | 0 |
| Haryana | 0 | 1 | 0 |
- It may be implemented using
OneHotEncoderclass from sklearn.preprocessing module.
The label in the iris dataset is a categorical attribute.
iris_data.label.unique()
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)
There are three class labels. Let's convert them to one hot vectors.
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
print('Shape of y before encoding : ', iris_data.label.shape)
'''
Passing 1d arrays as data to onehotcoder is deprecated in version, hence reshape to (-1,1) to have two dimensions.
Input of onehotencoder fit_transform must not be 1-rank array
'''
iris_labels = one_hot_encoder.fit_transform(iris_data.label.values.reshape(-1, 1))
# y.reshape(-1,1) is a 450 x 1 sparse matrix of type <class numpy.float64>
# y is a 150 x 3 sparse matrix of type <class numpy.float64> with 150 stored
# elements in Coordinate format.
print('Shape of y after encoding : ', iris_labels.shape)
# since output is sparse use toarray() to expand it.
print()
print('First 5 labels in one-hot vector form are : \n',iris_labels.toarray()[:5])
Shape of y before encoding : (150,) Shape of y after encoding : (150, 3) First 5 labels in one-hot vector form are : [[1. 0. 0.] [1. 0. 0.] [1. 0. 0.] [1. 0. 0.] [1. 0. 0.]]
3. Label Encoding
Another option is to use LabelEncoder for transforming categorical features into integer codes.
from sklearn.preprocessing import LabelEncoder
iris_labels = np.array(iris_data['label'])
label = LabelEncoder()
label_integer = label.fit_transform(iris_labels)
print('Labels in integer form are : \n', label_integer)
Labels in integer form are : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
4. MultiLabel Binarizer
Encodes categorical features with value 0 to $ k-1$ where $k$ is number of classes.
As the name suggests for case where output are multilabels there we use each unique label as column and assign 0 or 1 depending upon in the dataset that value is present or not.
Movie genres is best example to understand.
movie_genres = [
{'action', 'comedy'},
{'comedy'},
{'action', 'thriller'},
{'science-fiction', 'action', 'thriller'}
]
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit_transform(movie_genres)
array([[1, 1, 0, 0],
[0, 1, 0, 0],
[1, 0, 0, 1],
[1, 0, 1, 1]])
5. Using Dummy variables
Use get_dummies to create a one-hot encoding for each unique categorical value in the 'class' column
iris_data_onehot = pd.get_dummies(
iris_data, columns=['label'], prefix=['one_hot'])
iris_data_onehot.head()
| sepal length | sepal width | petal width | one_hot_Iris-setosa | one_hot_Iris-versicolor | one_hot_Iris-virginica | |
|---|---|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | 1 | 0 | 0 |
| 4.9 | 3.0 | 1.4 | 0.2 | 1 | 0 | 0 |
| 4.7 | 3.2 | 1.3 | 0.2 | 1 | 0 | 0 |
| 4.6 | 3.1 | 1.5 | 0.2 | 1 | 0 | 0 |
| 5.0 | 3.6 | 1.4 | 0.2 | 1 | 0 | 0 |
4.Custom Transformers¶
Enables conversion of an existing Python function into a transformer to assist in data cleaning or processing.
Useful when:
The dataset consists of hetereogeneous data types (e.g. raster images and text captions)
The dataset is stored in a
pandas.DataFrameand different columns require different processing pipelines.We need stateless transformations such as taking the log of frequencies, custom scaling, etc.
We can implement a transformer from an arbitary function with Function Transformer.
from sklearn.preprocessing import FunctionTransformer
For example, let us build a tranformer that applies a log transformation to features.
For this demonstration, we will be using a wine quality dataset from UCI machine learning repository.
It has got the following attributes:
fixed acidity
volatile acidity
citric acid
residual sugar
chlorides
free sulfur dioxide
total sulfur dioxide
density
pH
sulphates
alcohol
quality (output: score between 0 and 10)
wine_data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=';')
wine_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1599.0 | 8.319637 | 1.741096 | 4.60000 | 7.1000 | 7.90000 | 9.200000 | 15.90000 |
| volatile acidity | 1599.0 | 0.527821 | 0.179060 | 0.12000 | 0.3900 | 0.52000 | 0.640000 | 1.58000 |
| citric acid | 1599.0 | 0.270976 | 0.194801 | 0.00000 | 0.0900 | 0.26000 | 0.420000 | 1.00000 |
| residual sugar | 1599.0 | 2.538806 | 1.409928 | 0.90000 | 1.9000 | 2.20000 | 2.600000 | 15.50000 |
| chlorides | 1599.0 | 0.087467 | 0.047065 | 0.01200 | 0.0700 | 0.07900 | 0.090000 | 0.61100 |
| free sulfur dioxide | 1599.0 | 15.874922 | 10.460157 | 1.00000 | 7.0000 | 14.00000 | 21.000000 | 72.00000 |
| total sulfur dioxide | 1599.0 | 46.467792 | 32.895324 | 6.00000 | 22.0000 | 38.00000 | 62.000000 | 289.00000 |
| density | 1599.0 | 0.996747 | 0.001887 | 0.99007 | 0.9956 | 0.99675 | 0.997835 | 1.00369 |
| pH | 1599.0 | 3.311113 | 0.154386 | 2.74000 | 3.2100 | 3.31000 | 3.400000 | 4.01000 |
| sulphates | 1599.0 | 0.658149 | 0.169507 | 0.33000 | 0.5500 | 0.62000 | 0.730000 | 2.00000 |
| alcohol | 1599.0 | 10.422983 | 1.065668 | 8.40000 | 9.5000 | 10.20000 | 11.100000 | 14.90000 |
| quality | 1599.0 | 5.636023 | 0.807569 | 3.00000 | 5.0000 | 6.00000 | 6.000000 | 8.00000 |
Let's use np.log1p which returns natural logarithm of(1 + the feature value).
transformer = FunctionTransformer(np.log1p, validate=True)
wine_data_transformed = transformer.transform(np.array(wine_data))
pd.DataFrame(wine_data_transformed, columns=wine_data.columns).describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1599.0 | 2.215842 | 0.178100 | 1.722767 | 2.091864 | 2.186051 | 2.322388 | 2.827314 |
| volatile acidity | 1599.0 | 0.417173 | 0.114926 | 0.113329 | 0.329304 | 0.418710 | 0.494696 | 0.947789 |
| citric acid | 1599.0 | 0.228147 | 0.152423 | 0.000000 | 0.086178 | 0.231112 | 0.350657 | 0.693147 |
| residual sugar | 1599.0 | 1.218131 | 0.269969 | 0.641854 | 1.064711 | 1.163151 | 1.280934 | 2.803360 |
| chlorides | 1599.0 | 0.083038 | 0.038991 | 0.011929 | 0.067659 | 0.076035 | 0.086178 | 0.476855 |
| free sulfur dioxide | 1599.0 | 2.639013 | 0.623790 | 0.693147 | 2.079442 | 2.708050 | 3.091042 | 4.290459 |
| total sulfur dioxide | 1599.0 | 3.634750 | 0.682575 | 1.945910 | 3.135494 | 3.663562 | 4.143135 | 5.669881 |
| density | 1599.0 | 0.691519 | 0.000945 | 0.688170 | 0.690945 | 0.691521 | 0.692064 | 0.694990 |
| pH | 1599.0 | 1.460557 | 0.035760 | 1.319086 | 1.437463 | 1.460938 | 1.481605 | 1.611436 |
| sulphates | 1599.0 | 0.501073 | 0.093731 | 0.285179 | 0.438255 | 0.482426 | 0.548121 | 1.098612 |
| alcohol | 1599.0 | 2.431458 | 0.090434 | 2.240710 | 2.351375 | 2.415914 | 2.493205 | 2.766319 |
| quality | 1599.0 | 1.885054 | 0.122749 | 1.386294 | 1.791759 | 1.945910 | 1.945910 | 2.197225 |
Simple Examples :
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 9], [7, 8]])
transformer.transform(X)
array([[0. , 2.30258509],
[2.07944154, 2.19722458]])
transformer = FunctionTransformer(np.exp2)
X = np.array([[1,3], [2,4]])
transformer.transform(X)
array([[ 2., 8.],
[ 4., 16.]])
5.Composite Transformers¶
- It applies a set of transformers to columns of an array or
pandas.DataFrame, concatenates the transformed outputs from different transformers into a single matrix.
5.A. Apply Transformation to diverse features
It is useful for transforming heterogeneous data by applying different transformers to separate subsets of features.
It combines different feature selection mechanism and transformation into a single transformer object.
It is a list of tuples.
In the tuple, first we mention the reference name, second the method and third the column on which we want to apply column transformer.
X = [
[20.0,'male'],
[11.2,'female'],
[15.6,'female'],
[13.0,'male'],
[18.6, 'male'],
[16.4,'female']
]
X = np.array(X)
print(X)
[['20.0' 'male'] ['11.2' 'female'] ['15.6' 'female'] ['13.0' 'male'] ['18.6' 'male'] ['16.4' 'female']]
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler ,OneHotEncoder
col_trans = ColumnTransformer([
('scaler' ,MaxAbsScaler() ,[0]),
('pass' ,'passthrough' ,[0]) ,
('encoder' ,OneHotEncoder() ,[1])
])
col_trans.fit_transform(X)
array([['1.0', '20.0', '0.0', '1.0'],
['0.5599999999999999', '11.2', '1.0', '0.0'],
['0.78', '15.6', '1.0', '0.0'],
['0.65', '13.0', '0.0', '1.0'],
['0.93', '18.6', '0.0', '1.0'],
['0.82', '16.4', '1.0', '0.0']], dtype='<U32')
5.B. TransformedTargetRegressor
Transforms the target variable y before fitting a regression model.
The predicted values are mapped back to the original space via an inverse transform.
It takes regressor and transformer as arguments to be applied to the target variable.
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
# select a subset of data
X, y = X[:2000, :], y[:2000]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# transformer to scale the data
transformer = MinMaxScaler()
# first regressor - based on the original labels.
regressor = LinearRegression()
# second regressor - based on transformed labels.
ttr = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
regressor.fit(X_train, y_train)
print('R2 score of raw_label regression: {0:.4f}'.format(
regressor.score(X_test, y_test)))
ttr.fit(X_train, y_train)
print('R2 score of transformed label regression: {0:.4f}'.format(
ttr.score(X_test, y_test)))
R2 score of raw_label regression: 0.5853 R2 score of transformed label regression: 0.5853
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Feature Selection¶
sklearn.feature_selection module has useful APIs to select features/reduce dimensionality, either to improve estimators' accuracy score or to boost their performance on very high-dimensional datasets.
Top reasons to use feature selection are:
It enables the machine learning algorithm to train faster.
It reduces the complexity of a model and makes it easier to interpret.
It improves the accuracy of a model if the right subset is chosen.
It reduces overfitting.
1.FILTER-BASED METHODS¶
1.A. Variance Threshold¶
This transformer helps to keep only high variance features by providing a certain threshold.
Features with variance greater or equal to threshold value are kept rest are removed.
By default, it removes any feature with same value i.e. 0 variance.
data = [{'age': 4, 'height': 96.0},
{'age': 1, 'height': 73.9},
{'age': 3, 'height': 88.9},
{'age': 2, 'height': 81.6}
]
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
data_transformed = dv.fit_transform(data)
np.var(data_transformed, axis=0)
array([ 1.25 , 67.735])
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=5)
data_new = vt.fit_transform(data_transformed)
data_new
array([[96. ],
[73.9],
[88.9],
[81.6]])
As you may observe from output of above cell, the transformer has removed the age feature because its variance is below the threshold.
1.B. SelectKBest¶
It selects k-highest scoring features based on a function and removes the rest of the features.
Let's take an example of California Housing Dataset.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression
X_california, y_california = fetch_california_housing(return_X_y=True)
X, y = X_california[:2000], y_california[:2000]
Let's select 3 most important features, since it is a regression problem, we can use only mutual_info_regression of f_regression scoring functions only.
# mutual_info_regression is scoring method for linear regression method
skb = SelectKBest(mutual_info_regression, k=3)
X_new = skb.fit_transform(X, y)
print(f'Shape of feature-matrix before feature selection : {X.shape}')
print(f'Shape of feature-matrix after feature selection : {X_new.shape}')
Shape of feature-matrix before feature selection : (2000, 8) Shape of feature-matrix after feature selection : (2000, 3)
1.C. SelectPercentile¶
This is very similar to
SelectKBestfrom previous section, the only difference is, it selects toppercentileof all features and drops the rest of features.Similar to
SelecKBest, it also uses a scoring function to decide the importance of features.
Let's use the california housing price dataset for this API.
from sklearn.feature_selection import SelectPercentile
sp = SelectPercentile(mutual_info_regression, percentile=30)
X_new = sp.fit_transform(X, y)
print(f'Shape of feature-matrix before feature selection : {X.shape}')
print(f'Shape of feature-matrix after feature selection : {X_new.shape}')
Shape of feature-matrix before feature selection : (2000, 8) Shape of feature-matrix after feature selection : (2000, 3)
As you can see from above output, the transformed data now only has top 30 percentile of features, i.e only 3 out of 8 features.
skb.get_feature_names_out()
array(['x0', 'x6', 'x7'], dtype=object)
1.D. GenericUnivariateSelect¶
It applies univariate feature selection with a certain strategy, which is passed to the API via
modeparameter.The
modecan take one of the following values :percentile(top percentage)k_best(top k)fpr(false positive rate)fdr(false discovery rate)fwe(family wise error rate)
If we want to accomplish the same objective as
SelectKBest, we can use following code:
from sklearn.feature_selection import GenericUnivariateSelect
gus = GenericUnivariateSelect(mutual_info_regression, mode='k_best', param = 3)
X_new = gus.fit_transform(X,y)
print(f'Shape of feature-matrix before feature selection : {X.shape}')
print(f'Shape of feature-matrix after feature selection : {X_new.shape}')
Shape of feature-matrix before feature selection : (2000, 8) Shape of feature-matrix after feature selection : (2000, 3)
2.WRAPPER-BASED METHODS¶
2.A. Recursive Feature Elimination (RFE)¶
STEP 1 : Fits the model
STEP 2 : Ranks the features, afterwards it removes one or more features (depending upn
stepparameter)
These two steps are repeated until desired number of features are selected.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=3)
selector = selector.fit(X, y)
# support_ attribute is a boolean array marking which features are selected
print(selector.support_)
# rank of each feature
# if it's value is '1', then it is selected
# features with rank 2 and onwards are ranked least.
print(f'Rank of each feature is : {selector.ranking_}')
[ True False False False False False True True] Rank of each feature is : [1 3 3 2 3 2 1 1]
X_new = selector.transform(X)
print(f'Shape of feature-matrix before feature selection : {X.shape}')
print(f'Shape of feature-matrix after feature selection : {X_new.shape}')
Shape of feature-matrix before feature selection : (2000, 8) Shape of feature-matrix after feature selection : (2000, 3)
2.B. SelectFromModel¶
Selects desired number of important features (as specified with
max_featuresparameter) above certain threshold of feature importance as obtained from the trained estimator.The feature importance is obtained via
coef_,feature_importance_or animportance_gettercallable from the trained estimator.The feature importance threshold can be specified either numerically or through string argument based on built-in heuristics such as
mean,medianandfloatmultiples of these like0.1*mean.
from sklearn.feature_selection import SelectFromModel
estimator = LinearRegression()
estimator.fit(X, y)
LinearRegression()
print(f'Coefficients of features :\n {estimator.coef_}')
print()
print(f'Intercept of features : {estimator.intercept_}')
print()
print(f'Indices of top {3} features : {np.argsort(estimator.coef_)[-3:]}')
Coefficients of features : [ 3.64048292e-01 5.56221906e-03 5.13591243e-02 -1.64474348e-01 5.90411479e-05 -1.64573915e-01 -2.17724525e-01 -1.85343265e-01] Intercept of features : -13.720597901356236 Indices of top 3 features : [1 2 0]
t = np.argsort(np.abs(estimator.coef_))[-3:]
model = SelectFromModel(estimator, max_features=3, prefit=True)
X_new = model.transform(X)
print(f'Shape of feature-matrix before feature selection : {X.shape}')
print(f'Shape of feature-matrix after feature selection : {X_new.shape}')
Shape of feature-matrix before feature selection : (2000, 8) Shape of feature-matrix after feature selection : (2000, 3)
2.C. SequentialFeatureSelection¶
It performs feature selection by selecting or deselecting features one by one in a greedy manner.
from sklearn.feature_selection import SequentialFeatureSelector
%%time
estimator = LinearRegression()
sfs = SequentialFeatureSelector(estimator, n_features_to_select=3)
sfs.fit_transform(X, y)
print(sfs.get_support())
[ True False False False False True True False] Wall time: 135 ms
The features corresponding to True in the output of sfs.get_support() are selected. In this case,features 1, 6 and 7 are selected.
%%time
estimator = LinearRegression()
sfs = SequentialFeatureSelector(
estimator, n_features_to_select=3, direction='backward')
sfs.fit_transform(X, y)
print(sfs.get_support())
[ True False False False False True True False] Wall time: 194 ms
A couple of observations:
Both
forwardandbackwardselection methods select the same featurers.The
backwardselection method takes longer thanforwardselection method.
From above examples, we can observe that depending upon number of features, SFS can accomplish feature selection in different periods forwards and backwards.
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Feature Extraction¶
Feature Extraction aims to reduce the number of features in a dataset by creating new features from the existing ones (and then discarding the original features).
These new reduced set of features should then be able to summarize most of the information contained in the original set of features.
In this way, a summarised version of the original features can be created from a combination of the original set.
1. DictVectorizer¶
Many a times the data is present as a $\textbf {list of dictionary objects.}$
ML algorithms expect the data in matrix form with shape $(n,m)$ where $n$ is the number of samples and $m$ is the number of features.
Vectorizer converts a list of dictionary objects to feature matrix.
Let's create a sample data for demo purpose containing age and height of children.
Each record/sample is a dictionary with two keys age and height , and corresponding values.
from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Chennai', 'temperature': 33.},
{'city': 'Kolkata', 'temperature': 18.},
{'city': 'Delhi', 'temperature': 12.}]
vec = DictVectorizer()
vec.fit_transform(measurements).toarray()
array([[ 1., 0., 0., 33.],
[ 0., 0., 1., 18.],
[ 0., 1., 0., 12.]])
vec.get_feature_names_out()
array(['city=Chennai', 'city=Delhi', 'city=Kolkata', 'temperature'],
dtype=object)
2. PCA - Principal Component Analysis¶
PCA is used to decompose a multivariate dataset in a set of successive orthogonal components that capture maximum amount of the variance.
It helps in reducing dimensions of a dataset, thus computational cost of next steps e.g. training a model, cross validation etc.
Let's generate some artificial data to better understand PCA :
rand = np.random.RandomState(1)
X = np.dot(rand.rand(2, 2), rand.randn(2, 200)).T
plt.figure()
plt.title('Data points', size=20)
# set x and y labels
plt.xlabel('$x_1$', size=15)
plt.ylabel('$x_2$', size=15, rotation=0)
# plot the data points
plt.scatter(X[:, 0], X[:, 1], alpha=0.5)
plt.axis('equal')
(-2.7292712056271964, 2.5702744393352615, -0.9326181575178751, 1.004413830229183)
Let us fit a PCA transformer on this data and compute its two principal components:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
PCA(n_components=2)
Let's print the principle axes, they are two 2D vectors for this example.
The PCA object returns them in the form of a matrix, where each column returns them in the form of a matrix, where each column represents a principle component:
print(f'The {pca.n_components_} principle axes are :\n', pca.components_)
The 2 principle axes are : [[-0.94446029 -0.32862557] [-0.32862557 0.94446029]]
Let's also look at the explained variance corresponding to each priciple axes.
print('Explained variance by each component : ', pca.explained_variance_)
Explained variance by each component : [0.7625315 0.0184779]
To better understand PCA, let's visualize these principle axes :
There are two principle axes C1 and C2. They are orthogonal to each other. An additional vector C3 is also mentioned for comparision.
The lengths of C1 and C2 are taken as square root of respective explained variance. The length of the vector implies how important that vector is.
# draw projections of data points on different vectors
projections = X@pca.components_
print(projections.shape)
c3 = X[2]
arbitary_projection = X@c3
print(arbitary_projection.shape)
(200, 2) (200,)
plt.figure(figsize=(12,8))
plt.scatter(projections[:, 0], 1+np.zeros((200, 1)), alpha=0.3, color='r')
plt.scatter(projections[:, 1], -1+np.zeros((200, 1)), alpha=0.3, color='b')
plt.scatter(arbitary_projection, np.zeros((200,)), alpha=0.3, color='grey')
plt.legend(['$\mathbf{C_2}$', '$\mathbf{C_3}$'], prop={'size': 16})
plt.title("variance covered by different vectors", size=20)
plt.ylim([-1.5, 1.5])
plt.yticks([], [])
plt.axis('equal')
plt.grid(True)
plt.xlabel('$z$', size=20)
plt.show()
Reducing Dimensions
We can use PCA to reduce number of dimensions of a dataset. The components that are least important i.e. their explained variance is low, are removed and only those components that capture high(i.e. desired) amount of variance are kept.
Let's reduce the dimension of our data from 2 to 1. We can observe the transformed data has only 1 feature.
pca = PCA(n_components=1)
pca.fit(X)
X_pca = pca.transform(X)
print('Original shape :', X.shape)
print('Transformed shape :',X_pca.shape)
Original shape : (200, 2) Transformed shape : (200, 1)
To better understand what happened to our data, let's visualize our original data and the reduced data.
To do this, we will need to bring the transformed data into space or original data, which can be accomplished by inverse_transform method of PCA object.
plt.figure()
plt.title('Data and candidate vectors', size=20)
# set x and y labels
plt.xlabel('$x_1$', size=20)
plt.ylabel('$x_2$', size=20, rotation=0)
# plot data points
plt.scatter(X[:, 0], X[:, 1], alpha=0.3)
for length, principal_axis, axis_name, i_color in zip(pca.explained_variance_,
pca.components_,['$\mathbf{C_1}$', '$\mathbf{C_2}$'], ['r', 'b']):
v = principal_axis * np.sqrt(length)
v0, v1 = pca.mean_, pca.mean_ + v
# draw principal axis
plt.quiver(*v0, *(v1-v0), scale=0.33, scale_units='xy', color=i_color)
# label the principal axis
plt.text(*(3.4*v1), axis_name, size=20)
# draw 3rd component
lengths = np.eye(2)
np.fill_diagonal(lengths, np.sqrt(pca.explained_variance_))
c3 = pca.mean_+[-0.5, 0.3]
plt.quiver(*pca.mean_,*(1.1*(c3-pca.mean_)), scale=1, scale_units='xy',
color='grey')
# label the principal axis
plt.text(*(1.4*c3),'$\mathbf{C_3}$',size=20,color='grey')
plt.axis('equal')
plt.show()
From above chart it is clear that the new/transformed data points are now projected on C1 vector.
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Handling Imbalanced Data¶
Imbalanced datasets are those where one class is very less represented than the other class. This kind of data results in less efficient ML algorithm.
There are two main approaches to handle imbalanced data:
- Undersampling
- Oversampling

We will demonstrate how to handle imbalance with the help of wine quality dataset that we have used earlier.
wine_data = pd.read_csv(
"https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=';')
wine_data.shape
(1599, 12)
wine_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
wine_data.quality.value_counts(ascending=True)
3 10 8 18 4 53 7 199 6 638 5 681 Name: quality, dtype: int64
# display the histograms of the target variable 'quality'
wine_data['quality'].hist(bins=50)
plt.xlabel('Quality')
plt.ylabel('Number of samples')
plt.show()
1. Undersampling¶
Undersampling refers to sampling from the majority class in order to keep only a part of these data points.
It may be carried out using RandomUnderSampler class from imblearn library.
from imblearn.under_sampling import RandomUnderSampler
#class
class_count_3, class_count_4, class_count_5, class_count_6, class_count_7, class_count_8 = wine_data['quality'].value_counts()
# separate class
class_3 = wine_data[wine_data['quality'] == 3]
class_4 = wine_data[wine_data['quality'] == 4]
class_5 = wine_data[wine_data['quality'] == 5]
class_3 = wine_data[wine_data['quality'] == 3]
class_6 = wine_data[wine_data['quality'] == 6]
class_7 = wine_data[wine_data['quality'] == 7]
class_8 = wine_data[wine_data['quality'] == 8]
# print the shape of the class
print('class 3:', class_3.shape)
print('class 4:', class_4.shape)
print('class 5:', class_5.shape)
print('class 6:', class_6.shape)
print('class 7:', class_7.shape)
print('class 8:', class_8.shape)
wine_data.plot.hist()
plt.show()
class 3: (10, 12) class 4: (53, 12) class 5: (681, 12) class 6: (638, 12) class 7: (199, 12) class 8: (18, 12)
# It allows you to count the items in an iterable list.
from collections import Counter
X = wine_data.drop(['quality'],axis=1)
y = wine_data['quality']
undersampler = RandomUnderSampler(random_state =0)
X_rus, y_rus = undersampler.fit_resample(X,y)
print('Original dataset shape : ',y.shape)
print('Resampled dataset shape : ', y_rus.shape)
print()
print(Counter(y))
print(Counter(y_rus))
Original dataset shape : (1599,)
Resampled dataset shape : (60,)
Counter({5: 681, 6: 638, 7: 199, 4: 53, 8: 18, 3: 10})
Counter({3: 10, 4: 10, 5: 10, 6: 10, 7: 10, 8: 10})
The class with the least number of samples is '3'.
Hence all the other class samples are reduced to the number of samples in the least class.
2. Oversampling¶
Oversampling refers to replicating some points from the minority class in order to increase the cardinality of the minority class.
This might consist of either replicating or generating synthetic data for the minority class.
It may be carried out using RandomOverSampler class from imblearn library.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)
print('Original dataset shape : ', y.shape)
print('Resampled dataset shape : ', y_ros.shape)
print()
print(Counter(y))
print(Counter(y_ros))
Original dataset shape : (1599,)
Resampled dataset shape : (4086,)
Counter({5: 681, 6: 638, 7: 199, 4: 53, 8: 18, 3: 10})
Counter({5: 681, 6: 681, 7: 681, 4: 681, 8: 681, 3: 681})
print('New random points generated with RandomOverSampler : ',X_ros.shape[0] - X.shape[0])
New random points generated with RandomOverSampler : 2487
The class with the majority number of samples is '5'. Hence all the other class samples that are lesser than this class count are newly sampled to the number of samples in the majority class.
Oversampling using SMOTE¶
SMOTE (Synthetic Minority Oversampling Technique) is a popular technique for over sampling. It is available under imblean library.
from imblearn.over_sampling import SMOTE
oversampler = SMOTE()
X_smote, y_smote = oversampler.fit_resample(X, y)
Counter(y_smote)
Counter({5: 681, 6: 681, 7: 681, 4: 681, 8: 681, 3: 681})
print('New random points generated with SMOTE : ', X_ros.shape[0] - X.shape[0])
New random points generated with SMOTE : 2487
Types of SMOTE:
Borderline SMOTE
Borderline-SMOTE SVM
Adaptive Synthetic Sampling(ADASYN)
Import necessary libraries
from IPython.display import display, Math, Latex
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
Chaining Transformers¶
The preprocessing transformations are applied one after another on the input feature matrix.
It is important to apply exactly same transformation on training, evaluation and test set in the same order.
Failing to do so would lead to incorrect predictions from model due to distribution shift and hence incorrect performance evaluation.
The
sklearn.pipelinemodule provides utilities to build a composite estimator, as a chain of transformers and estimators.
Pipeline¶
Sequentially apply a list of transformers and estimators.
Intermediate steps of the pipeline must be 'transformer' i.e, they must implement
fitandtransformmethods.The final estimator only needs to implement
fit.
The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
1.Creating Pipelines¶
A pipeline can be created with Pipeline().
It takes a list of ('estimatorsName',estimator(...)) tuples. The pipeline object exposes interface of the last step.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
estimators = [
('simpleImputer' , SimpleImputer()),
('standardScaler' , StandardScaler()),
]
pipe = Pipeline(steps=estimators)
The same pipeline can also be created via make_pipeline() helper function, which doesn't take names of the steps and assigns them generic names based on their steps.
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(SimpleImputer(), StandardScaler())
2.Accessing Individual steps in a Pipeline¶
from sklearn.decomposition import PCA
estimators = [
('simpleImputer', SimpleImputer()),
('pca', PCA()),
('regressor', LinearRegression())
]
pipe = Pipeline(steps=estimators)
Let's print number of steps in this pipeline:
print(len(pipe.steps))
3
Let's look at each of the steps:
print(pipe.steps)
[('simpleImputer', SimpleImputer()), ('pca', PCA()), ('regressor', LinearRegression())]
The second estimator can be accessed in following 4 ways:
print(pipe.named_steps.regressor)
LinearRegression()
pipe.steps[1]
('pca', PCA())
pipe['pca']
PCA()
3.Accessing parameters of a step in pipeline¶
Parameters of the estimators in the pipeline can be accessed using the __syntax, note there are two underscores.
estimators = [
('simpleImputer', SimpleImputer()),
('pca', PCA()),
('regressor', LinearRegression())
]
pipe = Pipeline(steps=estimators)
pipe.set_params(pca__n_components=2)
Pipeline(steps=[('simpleImputer', SimpleImputer()),
('pca', PCA(n_components=2)),
('regressor', LinearRegression())])
In above example n_components of PCA() step is set after the pipeline is created.
4.GridSeachCV with Pipeline¶
By using naming convention of nested parameters, grid search can be implemented.
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param_grid = dict(
imputer=['passthrough',SimpleImputer(),KNNImputer()],
clf=[SVC(), LogisticRegression()],
clf_C=[0.1, 1, 10, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid)
cis an inverse of regularization, lower its value stronger the regularization is.In the example above
clf_Cprovides a set of values for grid search.
Caching Transformers¶
Transforming data is a computationally expensive step.
For grid search, transformers need not be applied for every parameter configuration.
They can be applied only once, and the transformed data can be reused.
This can be achived by setting
memoryparameter ofpipelineobject.
import tempfile
tempDirPath = tempfile.TemporaryDirectory()
estimators = [
('simpleImputer', SimpleImputer()),
('pca', PCA(2)),
('regressor',LinearRegression())
]
pipe = Pipeline(steps = estimators ,memory = tempDirPath)
FeatureUnion¶
Concatenates results of multiple transformer objects.
Applies a list of transformer objects in parallel, and their outputs are concatenated side-by-side into a larger matrix.
FeatuerUnionandPipelinecan be used to create complex transformers.
5.Visualizing Pipelines¶
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
('selector', ColumnTransformer([(
'select_first_4', 'passthrough', slice(0, 4))])),
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler()),
])
cat_pipeline = ColumnTransformer([
('label_binarizer', LabelBinarizer(), [4]),
])
full_pipeline = FeatureUnion(transformer_list=[('num_pipeline', num_pipeline),
('cat_pipeline', cat_pipeline)
])
from sklearn import set_config
set_config(display='diagram')
#displays HTML representation in a jupyter context
full_pipeline
FeatureUnion(transformer_list=[('num_pipeline',
Pipeline(steps=[('selector',
ColumnTransformer(transformers=[('select_first_4',
'passthrough',
slice(0, 4, None))])),
('imputer',
SimpleImputer(strategy='median')),
('scaler', StandardScaler())])),
('cat_pipeline',
ColumnTransformer(transformers=[('label_binarizer',
LabelBinarizer(),
[4])]))])Please rerun this cell to show the HTML repr or trust the notebook.FeatureUnion(transformer_list=[('num_pipeline',
Pipeline(steps=[('selector',
ColumnTransformer(transformers=[('select_first_4',
'passthrough',
slice(0, 4, None))])),
('imputer',
SimpleImputer(strategy='median')),
('scaler', StandardScaler())])),
('cat_pipeline',
ColumnTransformer(transformers=[('label_binarizer',
LabelBinarizer(),
[4])]))])ColumnTransformer(transformers=[('select_first_4', 'passthrough',
slice(0, 4, None))])slice(0, 4, None)
passthrough
SimpleImputer(strategy='median')
StandardScaler()
[4]
LabelBinarizer()
Linear regression with sklearn API¶
The objective of this notebook is to demonstrate how to build a linear regression model with sklearn.
We will be using the following set up:
Dataset : California Housing
Regression API :
LinearRegressionTraining :
fit(normal equation) andcross_validate(normal equation with cross validation).Evaluation :
score(r2 Score) andcross_val_scorewith different scoring parameters.
We will study the model diagnosis with LearningCurve and learn how to examine the learned model or weight vector.
Importing the libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import permutation_test_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
np.random.seed(42)
plt.style.use('seaborn')
We will use ShuffleSplit cross validation with:
10 folds (n_splits) and
set aside 20% examples as test examples (
test_size)
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
STEP 1: Load the dataset¶
The first step is to load the dataset. We have already discussed how to load California Housing dataset in the last demonstration.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
The feature matrix is loaded in features dataframes and the labels in labels dataframe.
Let's examine the shapes of these two dataframes.
print('Shape of feature matrix : ', features.shape)
print('Shape of labels matrix : ', labels.shape)
Shape of feature matrix : (20640, 8) Shape of labels matrix : (20640,)
As a sanity check, make sure that the number of rows in feature matrix and labels match.
assert (features.shape[0]==labels.shape[0])
STEP 2: Data Exploration¶
Data exploration has beein covered in week 4 notebook.
STEP 3: Preprocessing and model building¶
3A. Train-test split¶
The first step is to split the training data into test set. We do not access the test data till the end.
All data exploration and tuning is performed on the training set and by setting aside a small portion of training as a dev or validation set.
The following code snippet divides the data into training and test sets :
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(
features, labels, random_state=42)
train_features.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 15480 entries, 8158 to 15795 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedInc 15480 non-null float64 1 HouseAge 15480 non-null float64 2 AveRooms 15480 non-null float64 3 AveBedrms 15480 non-null float64 4 Population 15480 non-null float64 5 AveOccup 15480 non-null float64 6 Latitude 15480 non-null float64 7 Longitude 15480 non-null float64 dtypes: float64(8) memory usage: 1.1 MB
Let's examine the shapes of training and tet sets :
print('Number of training samples : ', train_features.shape[0])
print('Number of test samples : ', test_features.shape[0])
Number of training samples : 15480 Number of test samples : 5160
It's time to perform another sanity check-here we check if the training feature matrix has the same umber of rows as the training label vector.
We perform the same check on the test set too.
assert (train_features.shape[0] == train_labels.shape[0])
assert (test_features.shape[0] == test_labels.shape[0])
3B. Pipeline : Preprocessing + Model Building¶
As a first step, build linear regression models with default parameter setting of
LinearRegressionAPIs.We will make use of
PipelineAPI for combining data preprocessing and model building.We will use
StandardScalerfeature scaling to bring all features on the same scale followed by aLinearRegressionmodel.
The Pipeline object has two components:
StandardScaleras step1LinearRegressionas step2
After constructing the pipeline object, let's train it with set :
lin_reg_pipeline = Pipeline([
('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())
])
lin_reg_pipeline.fit(train_features, train_labels)
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])StandardScaler()
LinearRegression()
Now that we have trained the model, let's check the learnt / estimated weight vectors (intercept_, coef_) :
print('Intercept (w_0) : ',lin_reg_pipeline[-1].intercept_)
print()
print('Weight vector (w_1,w_2....,w_m) : \n' ,lin_reg_pipeline[-1].coef_)
Intercept (w_0) : 2.0703489205426377 Weight vector (w_1,w_2....,w_m) : [ 0.85210815 0.12065533 -0.30210555 0.34860575 -0.00164465 -0.04116356 -0.89314697 -0.86784046]
A few things to notice:
We accessed the
LinearRegressionobject aslin_reg_pipeline[-1]which is the last step in pipeline.The intercept can be obtained via
intercept_memeber variable andThe weight vector correspoinding to features via
coef_.
STEP 4: Model Evaluation¶
Let's use score method to obtain train and test errors with twin objectives.
Estimation of model performance as provided by test error.
Comparision of errors for model diagnostic purpose (underfit /overfit /just the right fit)
#evaluate model performance on both train and test set.
train_score = lin_reg_pipeline.score(train_features, train_labels)
print('Model performance on train set :', train_score)
test_score = lin_reg_pipeline.score(test_features, test_labels)
print('Model performance on test set :', test_score)
Model performance on train set : 0.609873031052925 Model performance on test set : 0.5910509795491352
The
scoremethod returnsr2score whose best value is 1.The
r2scores on training and test are comparable but they are not that high.It points to underfitting issue in model training.
4A. Cross validation sccore (cross_val_score)¶
Since the
scorewas computed on one fold that was selected as a test set, it may not be all that robust.In order to obtain robust estimate of the performance, we use
cross_val_scorethat calculatesscoreon different test folds through cross validation.
lin_reg_score = cross_val_score(lin_reg_pipeline, train_features, train_labels,scoring='neg_mean_squared_error' , cv=shuffle_split_cv)
print('Model performance on cross validation set : \n', lin_reg_score)
Model performance on cross validation set : [-0.50009976 -0.52183352 -0.55931218 -0.52110499 -0.56059203 -0.50510767 -0.52386194 -0.54775518 -0.5007161 -0.54713448]
print(
f'Score of linear regression model on the test set : \n'f"{lin_reg_score.mean():.3f} +/- {lin_reg_score.std():.3f}")
Score of linear regression model on the test set : -0.529 +/- 0.022
Here we got the negative mean squred error as a score. We can convert that to error as follows:
lin_reg_mse = - lin_reg_score
print(
f'MSE of linear regression model on the test set :\n' f'{lin_reg_mse.mean():.3f} +/- {lin_reg_mse.std():.3f}')
MSE of linear regression model on the test set : 0.529 +/- 0.022
We can use other scoring parameters and obtain cross validated scores based on that parameter.
The following choices are available for scoring:
expalined_variance
max_error
neg_mean_absolute_error
neg_root_mean_squared_log_error
neg_median_absolute_error
neg_mean_absolute_percentage_error
r2 score
4B. Cross validation¶
We just calculated cross_val_score based on the cross validation.
It however return only scores of each fold. What if we also need to access the models trained in each fold along with some other statistics like
train errorfor that fold.cross_validateAPI enables us to obtain them.
lin_reg_cv_results = cross_validate(lin_reg_pipeline ,train_features ,train_labels ,scoring='neg_mean_squared_error' ,return_train_score=True ,return_estimator=True ,cv=shuffle_split_cv)
The lin_reg_cv_results is a dictionary with the following contents :
trained
estimatorstime taken for fitting (
fit_time) and scoring(score_time) the models in cross validation,training score (
train_score) andtest scores (
test_score)
Returns of cross_validate score¶
scoresdict of float arrays of shape (n_splits,)
Array of scores of the estimator for each run of the cross validation.
A dict of arrays containing the score/time arrays for each scorer is returned.
The possible keys for this dict are:
test_score
The score array for test scores on each cv split.
Suffix_scoreintest_scorechanges to a specific metric liketest_r2ortest_aucif there are multiple scoring metrics in the scoring parameter.
train_score
The score array for train scores on each cv split.
Suffix_scoreintrain_scorechanges to a specific metric liketrain_r2ortrain_aucif there are multiple scoring metrics in the scoring parameter.This is available only if
return_train_scoreparameter isTrue.
fit_time
The time for fitting the estimator on each cv split.
This is available only if
return_fit_timeparameter isTrue.The time for fitting the estimator on the train set for each cv split.
score_time
The time for scoring the estimator on the test set for each cv split. (Note time for scoring on the train set is not included even if return_train_score is set to True)
estimator
The estimator objects for each cv split.
This is available only if
return_estimatorparameter is set toTrue.
Let's print the contents of the dictionary for us to examine :
lin_reg_cv_results
{'fit_time': array([0.00901079, 0.00700426, 0.0069983 , 0.00900698, 0.00699949,
0.0059979 , 0.00700974, 0.00700021, 0.00800037, 0.00799966]),
'score_time': array([0.00099754, 0.00100017, 0.00099826, 0.0009954 , 0.00100088,
0.00099993, 0.00101876, 0.00199103, 0.00099969, 0.0009973 ]),
'estimator': [Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])],
'test_score': array([-0.50009976, -0.52183352, -0.55931218, -0.52110499, -0.56059203,
-0.50510767, -0.52386194, -0.54775518, -0.5007161 , -0.54713448]),
'train_score': array([-0.52578695, -0.52035137, -0.51095597, -0.52049611, -0.51060835,
-0.52453922, -0.51994311, -0.5144039 , -0.52578473, -0.51397105])}
There are 10 values in each dictionary key. That is because of
cv=10 or 10-fold cross validation that we used.We compare training and test errors to access generalization performance of our model. However we have training and test scores in the
cv_resultsdictionary.Multiply these scores by -1 and convert them to errors.
train_error = -1 * lin_reg_cv_results['train_score']
test_error = -1 * lin_reg_cv_results['test_score']
print(f'Mean squared error of linear regression model on the train set:\n',
f'{train_error.mean():.3f} +\- {train_error.std():.3f}')
print()
print(f'Mean squared error of linear regression model on the test set:\n',
f'{test_error.mean():.3f} +\- {test_error.std():.3f}')
Mean squared error of linear regression model on the train set: 0.519 +\- 0.006 Mean squared error of linear regression model on the test set: 0.529 +\- 0.022
4C. Learning Curve / Effect of training set size on ERROR¶
Let's understand how the training set size or #samples affect the error.
We can use Learning_curve API that calculates cross validation scores for different #samples as specified in argument train_sizes.
#@ title [Plot learning curves]
def plot_learning_curve(train_sizes, train_scores, test_scores):
train_score_mean = np.mean(-train_scores, axis=1)
train_score_std = np.std(-train_scores, axis=1)
test_score_mean = np.mean(-test_scores, axis=1)
test_score_std = np.std(-test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
plt.fill_between(train_sizes,
train_score_mean - train_score_std,
train_score_mean + train_score_std,
alpha=0.1,
color='r',)
plt.fill_between(train_sizes,
test_score_mean + test_score_std,
test_score_mean - test_score_std,
alpha=0.1,
color='g')
plt.plot(train_sizes, train_score_mean, "o-", color='r', lw=2)
plt.plot(train_sizes, test_score_mean, "o-", color='g', lw=2)
plt.xlabel("Training examples ")
plt.ylabel("MSE")
# plt.legend(loc="best")
return plt.show()
(train_sizes, train_scores, test_scores, fit_times, score_times) = learning_curve(lin_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv,scoring='neg_mean_squared_error', n_jobs=-1,
return_times=True, train_sizes=np.linspace(0.2, 1, 10))
plot_learning_curve(train_sizes, train_scores, test_scores)
Observing that :
Both curves have reached a plateau; they are close and fairly high.
Few instances in the training set means the model can fit them perfectly. But as more instances are added to the training set, it becomes impossible for the model to fit the training data perfectly.
When the model is trained on very few training instances, it is not able of generalizing properly, which is why the validation error is initially quite high.
Then as the model learns on more training examples, the training and validation error reduce slowly.
These learning curves are typical of underfitting model.
4D. Scalability Curve / Effect of training set size on FIT TIME¶
We can also study how training scales as the function of number of training samples.
#@ title [Plot Scalability curves]
def plot_scalability_curve(train_sizes, fit_times):
train_score_mean = np.mean(-train_scores, axis=1)
train_score_std = np.std(-train_scores, axis=1)
test_score_mean = np.mean(-test_scores, axis=1)
test_score_std = np.std(-test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
plt.fill_between(train_sizes,
fit_times_mean - fit_times_std,
fit_times_mean + fit_times_std,
alpha=0.1,
color='g',)
plt.plot(train_sizes, fit_times_mean, "o-", color='b', lw=2)
plt.xlabel("Training examples ")
plt.ylabel("fit time")
# plt.legend(loc="best")
return plt.show()
plot_scalability_curve(train_sizes,fit_times)
As the number of training examples grows, the time to fit also increases.
4E. Model Examination¶
Let's examine the weight vectors and how much variability exists between them across different cross-validated models.
feature_names = train_features.columns
feature_names
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
'Latitude', 'Longitude'],
dtype='object')
For this we will first construct a dataframe of weight vectors and then plot them with boxplot.
coefs = [i[-1].coef_ for i in lin_reg_cv_results["estimator"]]
weights_df = pd.DataFrame(coefs, columns=feature_names)
color = {'whiskers':'black','medians':'green','caps':'blue'}
weights_df.plot.box(color=color, vert=False,figsize=(12,12))
plt.title('Linear regression coefficients')
plt.show()
There is not much variability in weights by different models. It can also be seen in the standard deviation of weights as seen in std row below
weights_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MedInc | 10.0 | 0.852153 | 0.005763 | 0.843517 | 0.848871 | 0.852711 | 0.854743 | 0.861659 |
| HouseAge | 10.0 | 0.122770 | 0.004312 | 0.117394 | 0.119895 | 0.121592 | 0.125520 | 0.130560 |
| AveRooms | 10.0 | -0.304384 | 0.010998 | -0.318971 | -0.310778 | -0.305795 | -0.302980 | -0.278426 |
| AveBedrms | 10.0 | 0.353528 | 0.018044 | 0.307474 | 0.349715 | 0.360139 | 0.364646 | 0.367952 |
| Population | 10.0 | -0.001576 | 0.003254 | -0.005787 | -0.002688 | -0.002063 | -0.000350 | 0.005297 |
| AveOccup | 10.0 | -0.037059 | 0.008582 | -0.047584 | -0.042486 | -0.042325 | -0.027562 | -0.026331 |
| Latitude | 10.0 | -0.894011 | 0.009975 | -0.908058 | -0.901411 | -0.895370 | -0.883967 | -0.881884 |
| Longitude | 10.0 | -0.869753 | 0.009097 | -0.884206 | -0.876542 | -0.869631 | -0.862110 | -0.857844 |
4F. Model Selection¶
Let's select the model with the lowest cross validated test error as the best performance model.
lin_reg_cv_results['estimator']
[Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())]),
Pipeline(steps=[('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])]
best_model_index = np.argmin(test_error)
selected_model = lin_reg_cv_results['estimator'][best_model_index]
Let's examine the model coefficients and intercepts :
print('Intercept (w_0) :',selected_model['lin_reg'].intercept_)
print()
print('Coefficients (w_1,w_2.....,w_m) : \n',selected_model['lin_reg'].coef_)
Intercept (w_0) : 2.0779898917958657 Coefficients (w_1,w_2.....,w_m) : [ 8.44324888e-01 1.18463901e-01 -3.04619574e-01 3.56620503e-01 1.74458509e-04 -4.23964612e-02 -8.96045642e-01 -8.68906479e-01]
4G. Model Performance¶
Towards this, let's first obtain the predictions for test points in cross validation.
from sklearn.model_selection import cross_val_predict
cv_predictions = cross_val_predict(lin_reg_pipeline, train_features, train_labels)
mse_cv = mean_squared_error(train_labels, cv_predictions)
plt.scatter(train_labels, cv_predictions, color='blue')
plt.plot(train_labels, train_labels, 'r-')
plt.title(f'Mean squared error = {mse_cv:.2f}', size=18)
plt.xlabel('Actual Median House value', size=12)
plt.ylabel('Predicted Median House value', size=12)
plt.show()
The model seems to be all over the place in its predictions for examples with label 5.
There are some negative predictions. We can fix this by adding a constraints on the weights to be positive.
At this stage, we should perform error analysis and check where the predictions are going wrong.
We can revisit feature construction, preprocessing or model stages and make the necessary course corrections to get better performance.
STEP 5 : Predictions¶
We can use the best performing model from cross validation for getting predictions on the test set.
test_predictions_cv = selected_model.predict(test_features)
test_predictions_cv[:5]
array([0.73548515, 1.7725621 , 2.70011199, 2.83716602, 2.60743151])
test_predictions = lin_reg_pipeline.predict(test_features)
test_predictions[:5]
array([0.72412832, 1.76677807, 2.71151581, 2.83601179, 2.603755 ])
STEP 6 : Report Model Performance¶
We report the model perfromance on the test set.
score_cv = selected_model.score(test_features, test_labels)
score = lin_reg_pipeline.score(test_features, test_labels)
print('R2 score for the best model obtained via cross validation :', score_cv)
print('R2 score for model w/o cv :', score)
R2 score for the best model obtained via cross validation : 0.5923577635319087 R2 score for model w/o cv : 0.5910509795491352
Alternatively we can use any other metric of interest and report performance based on that.
For example, the mean squared error is as follows:
mse_cv = mean_squared_error(test_labels, test_predictions_cv)
mse = mean_squared_error(test_labels, test_predictions)
print('MSE for the best model obtained via cross validation :', mse_cv)
print('MSE for model w/o cv : ', mse)
MSE for the best model obtained via cross validation : 0.5393995876218524 MSE for model w/o cv : 0.5411287478470688
Testing Model on other metrics¶
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, shuffle=False, test_size=0.2, random_state=0)
lin_reg_pipeline = Pipeline([
('scaling', StandardScaler()),
('lin_reg', LinearRegression())
])
lin_reg_pipeline.fit(X_train, y_train)
test_score = lin_reg_pipeline.score(X_test, y_test)
test_score
0.660514059153199
y_pred = lin_reg_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)
0.703338350752188
explained_variance_score(y_test, y_pred)
0.6605500501742702
max_error(y_test,y_pred)
7.260453292958401
mean_absolute_error(y_test, y_pred)
0.5168526993787042
mean_squared_error(y_test, y_pred)
0.4946848356388078
BASELINE MODELS¶
Now, we will build a couple of baseline models using DummyRegression and permutation_test_score.
We will compare performance of our linear regression model with these two baselines.
We will use ShuffleSplit as a cross validation strategy
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
Let's load the data and split it into training and test.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
train_features, test_features, train_labels, test_labels = train_test_split(
features, labels, random_state=42)
1. Linear Regression Classifier¶
Build linear regression model with feature scaling as part of a pipeline.
Train the model with 10-fold cross validation via ShuffleSplit.
Capture errors on different folds.
lin_reg_pipeline = Pipeline([
('feature scaling', StandardScaler()),
('lin_reg', LinearRegression())
])
lin_reg_cv_results = cross_validate(lin_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv, scoring='neg_mean_absolute_error', n_jobs=-1)
lin_reg_error = pd.Series(-lin_reg_cv_results['test_score'],
name='Linear regressor error')
lin_reg_cv_results.keys()
dict_keys(['fit_time', 'score_time', 'test_score'])
2. Dummy Regression Classifier¶
def dummy_regressor_baseline(strategy, constant_val=None, quantile_val=None):
baseline_model_median = DummyRegressor(strategy=strategy,
constant=constant_val,
quantile=quantile_val)
baseline_median_cv_results = cross_validate(
baseline_model_median, train_features, train_labels, cv=shuffle_split_cv, n_jobs=-1, scoring='neg_mean_absolute_error')
return pd.Series(-baseline_median_cv_results['test_score'], name="Dummy regressor error")
baseline_median_cv_results_errors = dummy_regressor_baseline(strategy='median')
baseline_mean_cv_results_errors = dummy_regressor_baseline(strategy='mean')
baseline_constant_cv_results_errors = dummy_regressor_baseline(
strategy='constant', constant_val=2)
baseline_quantile_cv_results_errors = dummy_regressor_baseline(
strategy='quantile', quantile_val=0.55)
Let's compare performance of these Dummy Regressors:
dummy_error_df = pd.concat([baseline_median_cv_results_errors,
baseline_mean_cv_results_errors,
baseline_constant_cv_results_errors,
baseline_quantile_cv_results_errors], axis=1)
dummy_error_df.columns = ['Median CV', 'Mean CV', 'Constant CV', 'Quantile CV']
dummy_error_df
| Median CV | Mean CV | Constant CV | Quantile CV | |
|---|---|---|---|---|
| 0 | 0.881187 | 0.918341 | 0.902993 | 0.891847 |
| 1 | 0.873773 | 0.898484 | 0.886983 | 0.877153 |
| 2 | 0.876366 | 0.894307 | 0.885488 | 0.877260 |
| 3 | 0.892083 | 0.915112 | 0.904889 | 0.895924 |
| 4 | 0.876835 | 0.909209 | 0.895593 | 0.884722 |
| 5 | 0.874685 | 0.903733 | 0.890999 | 0.880856 |
| 6 | 0.864097 | 0.902037 | 0.886215 | 0.874646 |
| 7 | 0.888523 | 0.917216 | 0.904945 | 0.894905 |
| 8 | 0.890963 | 0.921880 | 0.908874 | 0.898323 |
| 9 | 0.886765 | 0.919369 | 0.905994 | 0.895436 |
Plotting erros using barplot
dummy_error_df.plot.hist(bins=50, density=True, edgecolor='black')
plt.legend(bbox_to_anchor=(1.05,0.8), loc='upper left')
plt.xlabel('Mean absolute error(k$)', size=12)
plt.ylabel('Frequency', size=12)
plt.title('Distribution of the testing errors' ,size=16)
plt.show()
Permutation_test_score¶
It permutes the target to generate randomized data and computes the empirical p-value against the null hypothesis, that features and targets are independent.
Here we are interested in permutation_score returned by this API, which indicates score of the model on different permutations.
score, permutation_score, pvalue = permutation_test_score(
lin_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv, scoring='neg_mean_absolute_error', n_jobs=-1, n_permutations=30)
permutation_errors = pd.Series(-permutation_score, name='Permuted error')
print('Permutation test score :\n', permutation_score)
Permutation test score : [-0.91446539 -0.91608713 -0.91501122 -0.91112203 -0.91326112 -0.91428719 -0.91694297 -0.90660687 -0.90873595 -0.91546138 -0.9084695 -0.91174023 -0.91857102 -0.91467076 -0.90396709 -0.91239289 -0.91095499 -0.91729623 -0.90529415 -0.91436609 -0.91993036 -0.91661883 -0.91104746 -0.91563156 -0.91014294 -0.91526135 -0.90680247 -0.90796435 -0.91032999 -0.91545574]
Model Comparision¶
dummy_error_df.plot.hist(bins=50, density=True, edgecolor='black')
plt.legend(bbox_to_anchor=(1.05,0.8), loc='upper left')
plt.xlabel('Mean absolute error(k$)', size=12)
plt.ylabel('Frequency', size=12)
plt.title('Distribution of the testing errors' ,size=16)
plt.show()
errors_df = pd.concat([lin_reg_error, baseline_median_cv_results_errors,permutation_errors], axis=1)
errors_df.plot.hist(bins=50, density=True, edgecolor='black')
plt.legend(bbox_to_anchor=(1.05, 0.8), loc='upper left')
plt.xlabel('Mean absolute error(k$)', size=12)
plt.ylabel('Frequency', size=12)
plt.title('Distribution of the testing errors', size=16)
plt.show()
Linear regression with iterative optimization: SGDRegressor¶
In this notebook, we will build linear regression model, with SGDRegressor.
SGD offers a lot of control over optimization procedure through a number of hyperparameters. However, we need to set them to right values in order to make it work for training the model.
Importing the libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import permutation_test_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
np.random.seed(306)
plt.style.use('seaborn')
We will use ShuffleSplit as a cross validation strategy.
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
STEP 1: Load the dataset¶
features ,labels = fetch_california_housing(as_frame=True, return_X_y=True)
STEP 2 : Preprocessing¶
Split data into training and test sets.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels, random_state=42)
Divide the training data into train and dev sets.
train_features ,dev_features, train_labels, dev_labels = train_test_split(
com_train_features, com_train_labels, random_state=42)
STEP 3 : Model Building¶
Baseline SGDRegressor¶
Step 1 : To begin with, we instantiate a baseline
SGDRegressormodel with default parameters.Step 2 : Train the model with training feature matrix and labels.
Step 3 : Obtain the score on the training and dev data.
sgd = SGDRegressor(random_state=42)
sgd.fit(train_features, train_labels)
train_mae = mean_absolute_error(train_labels, sgd.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
Train MAE: 309190327803747.2 Dev MAE: 311959782899622.1
We can observe that the mean absolute error is too high. The baseline model doesn't train well. This may happen due to large learning rate.
Let's investigate this issue by training the model step by step and recording training loss in each step.
Adding a feature scaling step¶
We know that, SGD is sensitive to feature scaling. Let's add a feature scaling step and check if we get better MAE.
sgd_pipeline = Pipeline([
('scaler' , StandardScaler()),
('sgd' , SGDRegressor())
])
sgd_pipeline.fit(train_features, train_labels)
train_mae = mean_absolute_error(train_labels, sgd.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
Train MAE: 309190327803747.2 Dev MAE: 311959782899622.1
The error is still high.
Let's run SGDRegressor step by step and investigate issues with training :
Step 1 : Instantiate
SGDRegressorwithwarm_start = Trueandtol=-np.infty.Step 2 : Train SGD step by step and record regression loss in each step.
Step 3 : Plot learning curves and see if there are any issues in training.
eta0 = 1e-2
sgd_pipeline = Pipeline([
('scaler', StandardScaler()),
('sgd',SGDRegressor(max_iter=1, tol = -np.infty, warm_start=True, random_state=42))
])
loss = []
for epoch in range(100):
sgd_pipeline.fit(train_features, train_labels) #continues where it left off
loss.append(mean_squared_error(train_labels, sgd_pipeline.predict(train_features)))
plt.plot(np.arange(len(loss)), loss, 'g-')
plt.xlabel('Number of iterations ')
plt.ylabel('MSE')
plt.title(f'Learning curve: eta0={eta0:.3f}')
plt.show()
eta0 = 1e-3
sgd_pipeline = Pipeline([
('feature_scaling', StandardScaler()),
('sgd',SGDRegressor(max_iter=1, tol = -np.infty, warm_start=True, eta0=eta0,random_state=42))
])
loss = []
for epoch in range(100):
sgd_pipeline.fit(train_features, train_labels)
loss.append(mean_squared_error(train_labels, sgd_pipeline.predict(train_features)))
plt.plot(np.arange(len(loss)), loss, 'g-')
plt.xlabel('Number of iterations ')
plt.ylabel('MSE')
plt.title(f'Learning curve: eta0={eta0:.3f}')
plt.show()
The is an ideal learning curve where the train loss reduces monotonically as the training progresses.
print("Number of iteration before reaching convergence criteria :",sgd_pipeline[-1].n_iter_)
print("Number of weight updates : ", sgd_pipeline[-1].t_)
Number of iteration before reaching convergence criteria : 1 Number of weight updates : 11611.0
Checking train and dev mean absolute error.
train_mae = mean_absolute_error(train_labels, sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd_pipeline.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
Train MAE: 0.5333732983042573 Dev MAE: 0.5190150280908042
Fixing learning rate through validation curves¶
Step 1 : Provide the list of values to be tried for a hyperparameter.
Step 2 : Instantiate an object of
validation_curvewith estimator, training features and label. Setscoringparameter to relevant score.Step 3 : Convert scores to error.
Step 4 : Plot validation curve with the value of hyper-parameter on x-axis and error on the y-axis
Step 5 : Fix the hyper-parameter value where the test error is the least.
%%time
eta0 = [1e-5, 1e-4, 1e-3, 1e-2]
train_scores, test_scores = validation_curve(
sgd_pipeline, com_train_features, com_train_labels, param_name="sgd__eta0", param_range=eta0, cv=shuffle_split_cv, scoring='neg_mean_squared_error', n_jobs=2
)
Wall time: 1.86 s
train_errors, test_errors = -train_scores, -test_scores
plt.plot(eta0, train_errors.mean(axis=1),'g-x',label='Training error')
plt.plot(eta0, test_errors.mean(axis=1),'r--x', label='Test error')
plt.legend()
plt.xlabel('eta0')
plt.ylabel('Mean absolute error')
plt.title('Validation curve for SGD')
plt.show()
For eta0=1e-3, the test error is the least and hence we select that value as the value for eta0.
Next we also plot standard deviation in errors.
plt.errorbar(eta0, train_errors.mean(axis=1), yerr=train_errors.std(axis=1), label='Training error')
plt.errorbar(eta0, test_errors.mean(axis=1),yerr=test_errors.std(axis=1), label='Testing error')
plt.legend(loc='best')
plt.xlabel('eta0')
plt.ylabel('Mean absolute error')
plt.title('Validation curve for SGD')
plt.show()
Experimenting with learning rate parameter¶
1. No learning rate parameter¶
sgd_pipeline = Pipeline([
("scaler", StandardScaler()),
("sgd", SGDRegressor(max_iter=500,
early_stopping=True,
eta0=1e-3,
tol=1e-3,
validation_fraction=0.2,
n_iter_no_change=5,
average=10,
random_state=42))
])
sgd_pipeline.fit(train_features, train_labels)
train_mae = mean_absolute_error(train_labels, sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd_pipeline.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
print()
# development set dev set
print('Number of SGD iterations :', sgd_pipeline[-1].n_iter_)
print('Number of weight updates : ', sgd_pipeline[-1].t_)
Train MAE: 0.5433287489797834 Dev MAE: 0.5345612073911922 Number of SGD iterations : 35 Number of weight updates : 406351.0
2. learning rate = 'constant'¶
sgd_pipeline = Pipeline([
("scaler", StandardScaler()),
("sgd", SGDRegressor(max_iter=500,
early_stopping=True,
eta0=1e-3,
tol=1e-3,
learning_rate= 'constant',
validation_fraction=0.2,
n_iter_no_change=5,
average=10,
random_state=42))
])
sgd_pipeline.fit(train_features, train_labels)
train_mae = mean_absolute_error(
train_labels, sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd_pipeline.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
print()
# development set dev set
print('Number of SGD iterations :', sgd_pipeline[-1].n_iter_)
print('Number of weight updates : ', sgd_pipeline[-1].t_)
Train MAE: 0.5528203432984674 Dev MAE: 0.5689568216133667 Number of SGD iterations : 11 Number of weight updates : 127711.0
3. learning rate = 'adaptive'¶
sgd_pipeline = Pipeline([
("scaler", StandardScaler()),
("sgd", SGDRegressor(max_iter=500,
early_stopping=True,
eta0=1e-3,
tol=1e-3,
learning_rate='adaptive',
validation_fraction=0.2,
n_iter_no_change=5,
average=10,
random_state=42))
])
sgd_pipeline.fit(train_features, train_labels)
train_mae = mean_absolute_error(
train_labels, sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd_pipeline.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
print()
# development set dev set
print('Number of SGD iterations :', sgd_pipeline[-1].n_iter_)
print('Number of weight updates : ', sgd_pipeline[-1].t_)
Train MAE: 0.5375525445454805 Dev MAE: 0.5200254740759911 Number of SGD iterations : 40 Number of weight updates : 464401.0
Setting max_iters parameter¶
max_iter = np.ceil(1e6/com_train_features.shape[0])
max_iter
65.0
sgd_pipeline = Pipeline([
("scaler", StandardScaler()),
("sgd", SGDRegressor(max_iter=max_iter,
early_stopping=True,
eta0=1e-3,
tol=1e-3,
learning_rate='adaptive',
validation_fraction=0.2,
n_iter_no_change=5,
average=10,
random_state=42))
])
sgd_pipeline.fit(train_features, train_labels)
train_mae = mean_absolute_error(
train_labels, sgd_pipeline.predict(train_features))
dev_mae = mean_absolute_error(dev_labels, sgd_pipeline.predict(dev_features))
print('Train MAE: ', train_mae)
print('Dev MAE: ', dev_mae)
print()
# development set dev set
print('Number of SGD iterations :', sgd_pipeline[-1].n_iter_)
print('Number of weight updates : ', sgd_pipeline[-1].t_)
Train MAE: 0.5375525445454805 Dev MAE: 0.5200254740759911 Number of SGD iterations : 40 Number of weight updates : 464401.0
SUMMARY :¶
In this notebook, we saw:
how to build
SGDRegressormodel.how to tune the learning rate.
how to use different
learning_ratesand their impact on convergence.how to use early stopping and averaged SGD
how to tune hyper-parameters using
validation_curves.
California housing dataset¶
This notebook introduces California housing dataset that we will be using for regression demonstration.
We also list down the steps for typical dataset exploration, which can be applied broadly to any dataset.
Loading the dataset¶
This dataset can be fetched from sklearn with fetch_california_housing API.
from sklearn.datasets import fetch_california_housing
from scipy.stats import loguniform
from scipy.stats import uniform
In order to analyze the dataset, let's load it as a dataframe.
california_housing = fetch_california_housing(as_frame=True)
type(california_housing)
sklearn.utils._bunch.Bunch
The bunch object is a dictionary like object with the following attributes:
data, is a pandas object (sinceas_frame=True).Each row corresponds to 8 features values.
targetvalue contains average house value in units of 100_000. This is also a pandas object (sinceas_frame=True).DESCR contains description of the dataset.
framecontains dataframe with data and target
Each of these attributes can be accessed as <bunch_object>.key. In our case, we can access these features as follows:
california_housing.datagives us access to contents ofdatakey.california_housing.targetgives us access to contents oftargetkey.california_housing.feature_namesgives us access to contents offeature_nameskey.california_housing.DESCRgives us access to contents ofDESCRkey.california_housing.framegives us access to contents offramekey.
Dataset Exploration¶
STEP 1: Dataset description¶
Let's look at the description of the dataset.
print(california_housing.DESCR)
.. _california_housing_dataset:
California Housing dataset
--------------------------
**Data Set Characteristics:**
:Number of Instances: 20640
:Number of Attributes: 8 numeric, predictive attributes and the target
:Attribute Information:
- MedInc median income in block group
- HouseAge median house age in block group
- AveRooms average number of rooms per household
- AveBedrms average number of bedrooms per household
- Population block group population
- AveOccup average number of household members
- Latitude block group latitude
- Longitude block group longitude
:Missing Attribute Values: None
This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).
This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).
An household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surpinsingly large values for block groups with few households
and many empty houses, such as vacation resorts.
It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.
.. topic:: References
- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297
Note down key statistics from this description such as number of examples (or sample or instances) from the description :
There are 20640 examples in the dataset.
There are 8 numberical attributes per example
The target label is median house value.
There are no missing values in this dataset.
STEP 2 : Examine shape of feature matrix¶
Number of examples and features can be obtained via shape of california_housing.data.
california_housing.data.shape
(20640, 8)
type(california_housing.data)
pandas.core.frame.DataFrame
STEP 3 : Examine shape of label¶
Let's look at the shape of label vector.
california_housing.target.shape
(20640,)
type(california_housing.target)
pandas.core.series.Series
STEP 4: Get Feature names¶
Let's find out names of the attributes / features.
california_housing.feature_names
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Note the attributes and their description, which is a key step in understanding the data.
MedInc - median income in block
HouseAge - median house age in block
AveRooms - average number of rooms
AveBedrms - average number of bedrooms
Population - block population
AveOccup - Average house occupancy
Latitude - house block latitude
Longitude - house block longitude
STEP 5: Examine sample training examples¶
Let's look at a few training examples along with labels.
# frame.head() for both features and labels
california_housing.frame.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
The dataset contains aggregated data about each district in California
STEP 6: Examine features¶
Let's look at the features.
# data.head() for only features
california_housing.data.head()
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 |
We have information about :
Demography of each district (income, population, house occupancy,
Location of the disctricts (latitude and longitude) &
Characteristics of houses in the district (#rooms, #bedrooms, age of house)
Since the information is aggregated at the district levels, the features corresponds to average or median.
STEP 7: Examine target¶
Let's look at the target to be predicted.
# target.head() for only labels
california_housing.target.head()
0 4.526 1 3.585 2 3.521 3 3.413 4 3.422 Name: MedHouseVal, dtype: float64
The target contains median of the house value for each district. We can see that the target is a real number and hence this is a regression problem.
STEP 8: Examine details of features and labels¶
Let's look at the details of features and target labels.
california_housing.frame.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MedInc 20640 non-null float64 1 HouseAge 20640 non-null float64 2 AveRooms 20640 non-null float64 3 AveBedrms 20640 non-null float64 4 Population 20640 non-null float64 5 AveOccup 20640 non-null float64 6 Latitude 20640 non-null float64 7 Longitude 20640 non-null float64 8 MedHouseVal 20640 non-null float64 dtypes: float64(9) memory usage: 1.4 MB
We observe that :
The dataset contains 20640 examples with 8 features.
All features are numerical features encoded as floating point numbers.
There are no missing values in any features - the
non-nullis equal to the number of examples in the training set.
STEP 9: Feature and target histograms.¶
Let's look at the distribution of these features and target by plotting their histograms.
import matplotlib.pyplot as plt
import seaborn as sns
california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
plt.subplots_adjust(hspace=0.5, wspace=0.4)
Let's observe these histogram and note down our findings:
MedInc has a long tail distribution-salary of people is more or less normally distributed with a few folks getting a high salary.
HouseAge has more or less a uniform distribution.
The range for features, AveRooms, AveBedrms, AveOccups, Population, is large and its contains a small number of large values(as there are unnoticable bins on the right in the histograms plots of these features). That would mean that there could be certain outlier values present in these features.
Latitude and Longitude carry geographical information. Their combination helps us decide price of the house.
MedHouseVal also has a long tail distribution. It spikes towards the end. The reason is that the houses with price more than 5 are given value of 5.
STEP 10: Feature and target statistics¶
Let's look at statistics of these features and the target.
california_housing.frame.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MedInc | 20640.0 | 3.870671 | 1.899822 | 0.499900 | 2.563400 | 3.534800 | 4.743250 | 15.000100 |
| HouseAge | 20640.0 | 28.639486 | 12.585558 | 1.000000 | 18.000000 | 29.000000 | 37.000000 | 52.000000 |
| AveRooms | 20640.0 | 5.429000 | 2.474173 | 0.846154 | 4.440716 | 5.229129 | 6.052381 | 141.909091 |
| AveBedrms | 20640.0 | 1.096675 | 0.473911 | 0.333333 | 1.006079 | 1.048780 | 1.099526 | 34.066667 |
| Population | 20640.0 | 1425.476744 | 1132.462122 | 3.000000 | 787.000000 | 1166.000000 | 1725.000000 | 35682.000000 |
| AveOccup | 20640.0 | 3.070655 | 10.386050 | 0.692308 | 2.429741 | 2.818116 | 3.282261 | 1243.333333 |
| Latitude | 20640.0 | 35.631861 | 2.135952 | 32.540000 | 33.930000 | 34.260000 | 37.710000 | 41.950000 |
| Longitude | 20640.0 | -119.569704 | 2.003532 | -124.350000 | -121.800000 | -118.490000 | -118.010000 | -114.310000 |
| MedHouseVal | 20640.0 | 2.068558 | 1.153956 | 0.149990 | 1.196000 | 1.797000 | 2.647250 | 5.000010 |
We can observe that there is a large difference between 75% and max values of AveRooms, AveBedrms, population and AveOccups- which confirms our intuition about presence of outliers or extreme values in these features.
STEP 11 : Pairplot¶
_ = sns.pairplot(data=california_housing.frame, hue = 'MedHouseVal', palette='viridis')
A few observations based on pairplot:
MedIncomeseems to be useful in distinguishing between low and high valued houses.A few features have extreme values.
Latitude and logitude together seem to distinguish between low and high valued houses.
Summary¶
Explored california housing dataset that would be used for demonstrating implementation of linear regression models.
Examined various statistics of the dataset - #samples, #labels
Examined distribution of features through histogram and pairplots.
Linear Regression for house-price prediction¶
In this notebook, we will build different regression models for california house price prediction:
Linear Regression (with normal equation)
SGD Regression (linear regression with iterative optimization)
Polynomial Regression
Regularized Regression models : RIDGE & LASSO
We will set regularization rate and polynomial degree with hyper-parameter tuning and cross validation.
We will compare different models in terms of their parameter vectors and mean absolute error on train, eval and test sets.
Imports¶
For regression problems, we need to import classes and utilities from
sklearn.linear_model.This module has implementation for different regression models like, LinearRegression, SGDRegressor, Ridge, Lasso, RidgeCV and LassoCV.
We also need to import a bunch of model selection utilities from
sklearn.model_selectionmodule and metrics fromsklearn.metricsmodule.The data preprocessing utilities are imported from
sklearn.preprocessingmodules.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import loguniform
from scipy.stats import uniform
from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
Common set up¶
Set up random seed to a number of your choice.
np.random.seed(306)
Let's use ShuffleSplit as cv with 10 splits and 20% examples set aside as test examples.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
Data Loading and Splitting¶
We use california housing dataset for this demo.
We will load this dataset with
fetch_california_housingAPI as a dataframe.We will load the data and split it inot three parts- train, dev and test. Train+Dev will be used for cross validation and test will be used for evaluating the trained models.
# Fetching dataset
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
# train-test-split
com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels, random_state=42)
# train --> train + dev split
train_features, dev_features, train_labels, dev_labels = train_test_split(
com_train_features, com_train_labels, random_state=42)
Throughout this notebook, we will have the following pattern for each estimator:
We will be using
pipelinefor combining data preprocessing and modelling steps.cross_validatefor training the model withShuffleSplitcross validation andneg_mean_absolute_erroras a scoring metric.Convert the scores to error and report mean absolute errors on the dev set.
1. Linear Regression (with normal equation)¶
Let's use normal equation method to train linear regression model.
We set up pipeline with two stages:
Feature scaling to scale the features and
Linear regression on the transformed feature matrix.
Throughout this notebook, we will have the following pattern for each estimator:
We will be using
pipelinefor combining data preprocessing and modelling steps.cross_validatefor training the model withShuffleSplitcross validation andneg_mean_absolute_erroras a scoring metric.Convert the scores to error and report mean absolute errors on the dev set.
lin_reg_pipeline = Pipeline([
("feature_scaling", StandardScaler()),
("lin_reg", LinearRegression())
])
lin_reg_cv_results = cross_validate(lin_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring="neg_mean_absolute_error",
return_train_score=True,
return_estimator=True)
lin_reg_train_error = -1 * lin_reg_cv_results['train_score']
lin_reg_test_error = -1 * lin_reg_cv_results['test_score']
print(f"Mean absolute error of linear regression model on the train set:\n" f"{lin_reg_train_error.mean():.3f} +/- {lin_reg_train_error.std():.3f}")
print()
print(f"Mean absolute error of linear regression model on the test set:\n" f"{lin_reg_test_error.mean():.3f} +/- {lin_reg_test_error.std():.3f}")
Mean absolute error of linear regression model on the train set: 0.530 +/- 0.002 Mean absolute error of linear regression model on the test set: 0.527 +/- 0.008
Both the errors are close, but are not low. This points to underfitting. We can address it by adding more feature through polynomial regression.
2. SGD Regression (iterative optimization)¶
Let's use iterative optimization method to train linear regression model.
We set up pipeline with two stages:
Feature scaling to scale features and
SGD regression on the transformed feature matrix
sgd_reg_pipeline = Pipeline([
('feature_scaling', StandardScaler()),
('sgd_reg', SGDRegressor(max_iter=np.ceil(1e6/com_train_features.shape[0]),
early_stopping=True,
eta0=1e-4,
learning_rate='constant',
tol=1e-5,
validation_fraction=0.1,
n_iter_no_change=5,
average=10,
random_state=42))
])
sgd_reg_cv_results = cross_validate(sgd_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv, # shufflesplit declared above
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
sgd_train_error = -1 * sgd_reg_cv_results['train_score']
sgd_test_error = -1 * sgd_reg_cv_results['test_score']
print(f"Mean absolute error of SGD regression model on the train set:\n" f"{sgd_train_error.mean():.3f} +/- {sgd_train_error.std():.3f}")
print(f"Mean absolute error of SGD regression model on the test set:\n" f"{sgd_test_error.mean():.3f} +/- {sgd_test_error.std():.3f}")
Mean absolute error of SGD regression model on the train set: 0.534 +/- 0.013 Mean absolute error of SGD regression model on the test set: 0.530 +/- 0.010
SGD Regression : Regularization & Hyper-parameter tuning¶
We can also perform regularization with SGD. SGDRegressor has many hyperparameters that require careful tuning to achieve the same performance as wtih LinearRegression.
poly_sgd_pipeline = Pipeline([
('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('sgd_reg', SGDRegressor(
penalty='elasticnet',
random_state=42
))])
poly_sgd_cv_results = cross_validate(poly_sgd_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
poly_sgd_train_error = -1 * poly_sgd_cv_results['train_score']
poly_sgd_test_error = -1 * poly_sgd_cv_results['test_score']
print(f"Mean absolute error of SGD regression model on the train set. \n {poly_sgd_train_error.mean():.3f} +/- {poly_sgd_train_error.std():.3f}")
print(f"Mean absolute error of SGD regression model on the test set. \n {poly_sgd_test_error.mean():.3f} +/- {poly_sgd_test_error.std():.3f}")
Mean absolute error of SGD regression model on the train set. 10824283052.546 +/- 4423288211.832 Mean absolute error of SGD regression model on the test set. 10946788540.250 +/- 5396536227.703
The error is too high.
So now, lets search for the best set of parameters for polynomial + SGD pipeline with
RandomizedSearchCV.In
RandomizedSearchCV, we need to specify distributions for hyperparameters.
class uniform_int:
"""
Integer valued version of the uniform distributions
"""
def __init__(self, a, b):
self._distribution = uniform(a, b)
def rvs(self, *args, **kwargs):
""" Random variable sample"""
return self._distribution.rvs(*args, **kwargs).astype(int)
Let's specify RandomizedSearchCV set up.
param_distributions = {
'poly__degree':[1,2,3],
'sgd_reg__learning_rate': ['constant', 'adaptive', 'invscaling'],
'sgd_reg__l1_ratio': uniform(0,1),
'sgd_reg__eta0': loguniform(1e-5,1),
'sgd_reg__power_t': uniform(0,1)
}
poly_sgd_random_search_cv = RandomizedSearchCV(
poly_sgd_pipeline, param_distributions=param_distributions, n_iter =10, cv=cv, verbose=1, scoring='neg_mean_absolute_error'
)
poly_sgd_random_search_cv.fit(com_train_features,com_train_labels)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
RandomizedSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling',
StandardScaler()),
('sgd_reg',
SGDRegressor(penalty='elasticnet',
random_state=42))]),
param_distributions={'poly__degree': [1, 2, 3],
'sgd_reg__eta0': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236BA2BA580>,
'sgd_reg__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236B8ADA220>,
'sgd_reg__learning_rate': ['constant',
'adaptive',
'invscaling'],
'sgd_reg__power_t': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236B8ADA550>},
scoring='neg_mean_absolute_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling',
StandardScaler()),
('sgd_reg',
SGDRegressor(penalty='elasticnet',
random_state=42))]),
param_distributions={'poly__degree': [1, 2, 3],
'sgd_reg__eta0': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236BA2BA580>,
'sgd_reg__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236B8ADA220>,
'sgd_reg__learning_rate': ['constant',
'adaptive',
'invscaling'],
'sgd_reg__power_t': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000236B8ADA550>},
scoring='neg_mean_absolute_error', verbose=1)Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('sgd_reg',
SGDRegressor(penalty='elasticnet', random_state=42))])PolynomialFeatures()
StandardScaler()
SGDRegressor(penalty='elasticnet', random_state=42)
The best score can be obtained as follows :
poly_sgd_random_search_cv.best_score_
-0.5271573234998475
The best set of parameters are obtained as follows:
poly_sgd_random_search_cv.best_params_
{'poly__degree': 2,
'sgd_reg__eta0': 2.3063822138075523e-05,
'sgd_reg__l1_ratio': 0.4590765695698895,
'sgd_reg__learning_rate': 'adaptive',
'sgd_reg__power_t': 0.17208702250773023}
And the best estimator can be accessed as follows :
poly_sgd_random_search_cv.best_estimator_
Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('sgd_reg',
SGDRegressor(eta0=2.3063822138075523e-05,
l1_ratio=0.4590765695698895,
learning_rate='adaptive', penalty='elasticnet',
power_t=0.17208702250773023, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('sgd_reg',
SGDRegressor(eta0=2.3063822138075523e-05,
l1_ratio=0.4590765695698895,
learning_rate='adaptive', penalty='elasticnet',
power_t=0.17208702250773023, random_state=42))])PolynomialFeatures()
StandardScaler()
SGDRegressor(eta0=2.3063822138075523e-05, l1_ratio=0.4590765695698895,
learning_rate='adaptive', penalty='elasticnet',
power_t=0.17208702250773023, random_state=42)3. Polynomial Regression¶
We will train a polynomial model with degree 2 and later we will use
validation_curveto find out right degree to use for polynomial models.PolynomialFeaturestransforms the features to the user specified degrees (here it is 2).We perform feature scaling on the transformed features before using them for training the regression model.
poly_reg_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])
poly_reg_cv_results = cross_validate(poly_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
poly_reg_train_error = -1 * poly_reg_cv_results['train_score']
poly_reg_test_error = -1 * poly_reg_cv_results['test_score']
print(f"Mean absolute error of polynomial regression model of degree 2 on the train set: \n" f"{poly_reg_train_error.mean():.3f} +/- {poly_reg_train_error.std():.3f}")
print(f"Mean absolute error of polynomial regression model of degree 2 on the test set: \n" f"{poly_reg_test_error.mean():.3f} +/- {poly_reg_test_error.std():.3f}")
Mean absolute error of polynomial regression model of degree 2 on the train set: 0.461 +/- 0.003 Mean absolute error of polynomial regression model of degree 2 on the test set: 0.485 +/- 0.030
Notice that the training and validation errors have reduced after using the second order polynomial features to represent the model.
Instead of using all polynomial feature, we use only interaction feature terms (i.e interaction_only = True ) in polynomial model and train the linear regression model.
poly_reg_pipeline = Pipeline([
('poly', PolynomialFeatures(
degree=2, interaction_only=True)),
('feature_scaling', StandardScaler()),
('lin_reg', LinearRegression())])
poly_reg_cv_results = cross_validate(poly_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
poly_reg_train_error = -1 * poly_reg_cv_results['train_score']
poly_reg_test_error = -1*poly_reg_cv_results['test_score']
print(f"Mean absolute error of polynomial regression model of degree 2 on the train set: \n" f"{poly_reg_train_error.mean():.3f} +/- {poly_reg_train_error.std():.3f}")
print(f"Mean absolute error of polynomial regression model of degree 2 on the test set: \n" f"{poly_reg_test_error.mean():.3f} +/- {poly_reg_test_error.std():.3f}")
Mean absolute error of polynomial regression model of degree 2 on the train set: 0.478 +/- 0.003 Mean absolute error of polynomial regression model of degree 2 on the test set: 0.497 +/- 0.024
Notice that the training and validation errors have increased after using interaction_only = True to represent the model.
Let's figure out which degree polynomial is better suited for the regression problem at our hand. For that we will use validation_curve, which can be considered as a manual huperparameter tuning.
Here we specify a list of values that we want to try for polynomial degree and specify it as a parameter in validation_curve.
degree = [1, 2, 3, 4, 5]
train_scores, test_scores = validation_curve(
poly_reg_pipeline, com_train_features, com_train_labels, param_name='poly__degree',
param_range=degree, cv=cv, scoring='neg_mean_absolute_error', n_jobs=2
)
train_errors, test_errors = -train_scores, -test_scores
plt.plot(degree, train_errors.mean(axis=1), 'b-x', label="Training error")
plt.plot(degree, test_errors.mean(axis=1), 'r-x', label="Test error")
plt.legend()
plt.xlabel("degree")
plt.ylabel("Mean absolute error (k$)")
plt.title("Validation curve for polynomial regression")
plt.show()
We would select a degree for which the mean absolute error is the least.
In this case, it is degree = 2 that yields the least mean absolute error and that would be selected as an optimal degree for polynomial regression.
4. Ridge Regression¶
The polynomial models have a tendency to overfit - if we use higher order polynomial features.
We will use
Ridgeregression - which penalizes for excessive model complexity in the polynomial regression by adding a regularization term.Here we specify the regularization rate
alphaas 0.5 and train the regression model.Later we will launch hyperparameter search for the right value of
alphasuch that it leads to the least cross validation errors.
ridge_reg_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('feature_scaling', StandardScaler()),
('ridge', Ridge(alpha=0.5))])
ridge_reg_cv_results = cross_validate(ridge_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
ridge_reg_train_error = -1 * ridge_reg_cv_results['train_score']
ridge_reg_test_error = -1 * ridge_reg_cv_results['test_score']
print(f'Mean absolute error of ridge regression model (alpha=0.5) the train set: \n' f'{ridge_reg_train_error.mean():.3f} +/- {ridge_reg_train_error.std():.3f}')
print(f'Mean absolute error of ridge regression model (alpha=0.5) the test set: \n' f'{ridge_reg_test_error.mean():.3f} +/- {ridge_reg_test_error.std():.3f}')
Mean absolute error of ridge regression model (alpha=0.5) the train set: 0.481 +/- 0.003 Mean absolute error of ridge regression model (alpha=0.5) the test set: 0.487 +/- 0.006
Hyperparameter tuning for ridge regularization rate¶
alpha_list = np.logspace(-4, 0, num=20)
ridge_reg_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('feature_scaling', StandardScaler()),
('ridge_cv', RidgeCV(alphas=alpha_list, cv=cv, scoring='neg_mean_absolute_error'))
])
ridge_reg_cv_results = ridge_reg_pipeline.fit(com_train_features, com_train_labels)
print('The score with the best alpha is :',
f'{ridge_reg_cv_results[-1].best_score_:.3f}')
print('The error with the best alpha is :',
f'{-ridge_reg_cv_results[-1].best_score_:.3f}')
print('The best value for alpha :', ridge_reg_cv_results[-1].alpha_)
The score with the best alpha is : -0.473 The error with the best alpha is : 0.473 The best value for alpha : 0.007847599703514606
Ridge HPT through GridSearchCV¶
ridge_grid_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('feature_scaling', StandardScaler()),
('ridge', Ridge())])
param_grid = {'poly__degree': (1, 2, 3),
'ridge__alpha': np.logspace(-4, 0, num=20)}
ridge_grid_search = GridSearchCV(ridge_grid_pipeline,
param_grid=param_grid,
n_jobs=-1,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True)
ridge_grid_search.fit(com_train_features, com_train_labels)
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('ridge', Ridge())]),
n_jobs=-1,
param_grid={'poly__degree': (1, 2, 3),
'ridge__alpha': array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00])},
return_train_score=True, scoring='neg_mean_absolute_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('ridge', Ridge())]),
n_jobs=-1,
param_grid={'poly__degree': (1, 2, 3),
'ridge__alpha': array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00])},
return_train_score=True, scoring='neg_mean_absolute_error')Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()), ('ridge', Ridge())])PolynomialFeatures()
StandardScaler()
Ridge()
ridge_grid_search.best_index_ gives the index of the best parameter in the list.
# best parameter
ridge_grid_search.cv_results_['params'][ridge_grid_search.best_index_]
{'poly__degree': 2, 'ridge__alpha': 0.007847599703514606}
mean_train_error = -1 * ridge_grid_search.cv_results_[
'mean_train_score'][ridge_grid_search.best_index_]
mean_test_error = -1 * ridge_grid_search.cv_results_[
'mean_test_score'][ridge_grid_search.best_index_]
std_train_error = -1 * ridge_grid_search.cv_results_[
'std_train_score'][ridge_grid_search.best_index_]
std_test_error = -1 * ridge_grid_search.cv_results_[
'std_test_score'][ridge_grid_search.best_index_]
print(f'Best Mean absolute error of polynomial ridge regression model on the train set:\n' f"{mean_train_error:.3f} +/- {std_train_error:.3f}")
print()
print(f'Best Mean absolute error of polynomial ridge regression model on the test set:\n' f"{mean_test_error:.3f} +/- {std_test_error:.3f}")
Best Mean absolute error of polynomial ridge regression model on the train set: 0.463 +/- -0.004 Best Mean absolute error of polynomial ridge regression model on the test set: 0.474 +/- -0.015
print('Mean cross validated score of the best estimator is : ',
ridge_grid_search.best_score_)
print('Mean cross validated error of the best estimator is : ', -
ridge_grid_search.best_score_)
Mean cross validated score of the best estimator is : -0.4738651177003218 Mean cross validated error of the best estimator is : 0.4738651177003218
ridge_grid_search.best_estimator_
Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('ridge', Ridge(alpha=0.007847599703514606))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('ridge', Ridge(alpha=0.007847599703514606))])PolynomialFeatures()
StandardScaler()
Ridge(alpha=0.007847599703514606)
5. Lasso Regression¶
lasso_reg_pipeline = Pipeline([
('poly',PolynomialFeatures(degree=2)),
('feature_scaling',StandardScaler()),
('lasso',Lasso(alpha=0.001))
])
lasso_reg_cv_results = cross_validate(lasso_reg_pipeline,
com_train_features,
com_train_labels,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True )
lasso_reg_train_error = -1 * lasso_reg_cv_results['train_score']
lasso_reg_test_error =-1 * lasso_reg_cv_results['test_score']
print(f'Mean absolute error of linear regression model on the train set : \n' f'{lasso_reg_train_error.mean():.3f} +/- {lasso_reg_train_error.std():.3f}')
print(f'Mean absolute error of linear regression model on the test set : \n' f'{lasso_reg_test_error.mean():.3f} +/- {lasso_reg_test_error.std():.3f}')
Mean absolute error of linear regression model on the train set : 0.503 +/- 0.008 Mean absolute error of linear regression model on the test set : 0.512 +/- 0.015
Lasso Regression with GridSearchCV¶
lasso_grid_pipeline =Pipeline([
('poly',PolynomialFeatures()),
('feature_scaling',StandardScaler()),
('lasso',Lasso())])
param_grid ={"poly__degree": (1,2,3),
"lasso__alpha": np.logspace(-4,0, num=20)}
lasso_grid_search = GridSearchCV(lasso_grid_pipeline,
param_grid=param_grid, n_jobs=2,
cv =cv,
scoring='neg_mean_absolute_error',
return_train_score = True)
lasso_grid_search.fit(com_train_features, com_train_labels)
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('lasso', Lasso())]),
n_jobs=2,
param_grid={'lasso__alpha': array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00]),
'poly__degree': (1, 2, 3)},
return_train_score=True, scoring='neg_mean_absolute_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()),
('lasso', Lasso())]),
n_jobs=2,
param_grid={'lasso__alpha': array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00]),
'poly__degree': (1, 2, 3)},
return_train_score=True, scoring='neg_mean_absolute_error')Pipeline(steps=[('poly', PolynomialFeatures()),
('feature_scaling', StandardScaler()), ('lasso', Lasso())])PolynomialFeatures()
StandardScaler()
Lasso()
mean_train_error = -1 * lasso_grid_search.cv_results_['mean_train_score'][lasso_grid_search.best_index_]
mean_test_error = -1 * lasso_grid_search.cv_results_['mean_test_score'][lasso_grid_search.best_index_]
std_train_error = -1 * lasso_grid_search.cv_results_['std_train_score'][lasso_grid_search.best_index_]
std_test_error = -1 * lasso_grid_search.cv_results_['std_test_score'][lasso_grid_search.best_index_]
print(f'Best Mean absolute error of polynomial lasso regression model on the train set : \n' f"{mean_train_error:.3f} +/- {std_train_error:.3f}")
print(f'Best Mean absolute error of polynomial lasso regression model on the test set : \n' f"{mean_test_error:.3f} +/- {std_test_error:.3f}")
Best Mean absolute error of polynomial lasso regression model on the train set : 0.462 +/- -0.003 Best Mean absolute error of polynomial lasso regression model on the test set : 0.488 +/- -0.046
Comparision of weight vectors¶
Let's look at the weight vectors produced by different models.
1. Polynomial Regression with CV¶
feature_names = poly_reg_cv_results["estimator"][0][0].get_feature_names_out(
input_features=train_features.columns)
print(feature_names)
['1' 'MedInc' 'HouseAge' 'AveRooms' 'AveBedrms' 'Population' 'AveOccup' 'Latitude' 'Longitude' 'MedInc HouseAge' 'MedInc AveRooms' 'MedInc AveBedrms' 'MedInc Population' 'MedInc AveOccup' 'MedInc Latitude' 'MedInc Longitude' 'HouseAge AveRooms' 'HouseAge AveBedrms' 'HouseAge Population' 'HouseAge AveOccup' 'HouseAge Latitude' 'HouseAge Longitude' 'AveRooms AveBedrms' 'AveRooms Population' 'AveRooms AveOccup' 'AveRooms Latitude' 'AveRooms Longitude' 'AveBedrms Population' 'AveBedrms AveOccup' 'AveBedrms Latitude' 'AveBedrms Longitude' 'Population AveOccup' 'Population Latitude' 'Population Longitude' 'AveOccup Latitude' 'AveOccup Longitude' 'Latitude Longitude']
print(poly_reg_cv_results['estimator'][0][-1])
LinearRegression()
coefs = [i[-1].coef_ for i in poly_reg_cv_results["estimator"]]
print(coefs[:2])
[array([-1.20413427e-12, -1.61569371e+01, -1.21092383e+01, 1.11235836e+01,
-9.44654938e+00, -2.34941156e+00, 7.36635757e+01, 1.63563849e+00,
-4.68455430e-01, 1.12528038e-01, -1.63804130e-01, 2.05049845e-01,
2.99114714e-01, 1.98368002e+00, -8.06864364e+00, -2.47640735e+01,
-2.20667537e-01, 3.51848996e-01, 5.66131647e-02, -1.71127167e+00,
-5.42873233e+00, -1.75752779e+01, -4.24679503e-02, -3.52790487e-01,
-1.84145609e-01, 6.71628693e+00, 1.72961794e+01, 5.20628084e-01,
5.12438573e-01, -5.63236511e+00, -1.44389342e+01, 1.30559891e+00,
-5.26771610e-01, -2.35921682e+00, 3.51078155e+01, 1.10727914e+02,
2.02753888e+00]), array([-8.21282625e-14, -1.70030899e+01, -1.17430112e+01, 1.22404452e+01,
-9.31590879e+00, -1.09418936e+00, 2.49354245e+00, 1.03838098e+00,
4.18560276e-02, 9.80811903e-02, -1.17108267e-01, 1.96625060e-01,
3.77107433e-01, -9.87223928e-01, -8.00160924e+00, -2.57487308e+01,
-1.59828460e-01, 3.13816437e-01, 6.37042527e-02, -1.50455340e+00,
-5.34177051e+00, -1.71678652e+01, -4.12497156e-02, -4.62865867e-01,
1.24184548e+00, 6.32264153e+00, 1.82827959e+01, 7.44399297e-01,
-8.48692820e-01, -4.62758747e+00, -1.35225103e+01, 1.77804564e+00,
4.51793935e-01, 1.82027262e-01, 8.97461836e+00, 1.10844152e+01,
1.07964837e+00])]
weights_poly_df = pd.DataFrame(coefs, columns=feature_names)
color = {'whiskers': 'black', 'medians': 'green', 'caps': 'blue'}
weights_poly_df.plot.box(color=color, vert=False, figsize=(12, 12))
plt.title('Polynomial regression coefficients')
plt.grid()
plt.show()
2. Ridge Regression with CV¶
ridge_reg_pipeline = Pipeline([
('poly', PolynomialFeatures(degree=2)),
('feature_scaling', StandardScaler()),
('ridge', Ridge(alpha=0.5))])
ridge_reg_cv_results = cross_validate(ridge_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
feature_names = ridge_reg_cv_results['estimator'][0][0].get_feature_names_out(
input_features=train_features.columns)
feature_names
array(['1', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',
'AveOccup', 'Latitude', 'Longitude', 'MedInc^2', 'MedInc HouseAge',
'MedInc AveRooms', 'MedInc AveBedrms', 'MedInc Population',
'MedInc AveOccup', 'MedInc Latitude', 'MedInc Longitude',
'HouseAge^2', 'HouseAge AveRooms', 'HouseAge AveBedrms',
'HouseAge Population', 'HouseAge AveOccup', 'HouseAge Latitude',
'HouseAge Longitude', 'AveRooms^2', 'AveRooms AveBedrms',
'AveRooms Population', 'AveRooms AveOccup', 'AveRooms Latitude',
'AveRooms Longitude', 'AveBedrms^2', 'AveBedrms Population',
'AveBedrms AveOccup', 'AveBedrms Latitude', 'AveBedrms Longitude',
'Population^2', 'Population AveOccup', 'Population Latitude',
'Population Longitude', 'AveOccup^2', 'AveOccup Latitude',
'AveOccup Longitude', 'Latitude^2', 'Latitude Longitude',
'Longitude^2'], dtype=object)
coefs = [i[-1].coef_ for i in ridge_reg_cv_results["estimator"]]
print(coefs[:1])
[array([ 0. , -2.28217749, -2.67967218, -0.58591007, 1.39039817,
-0.25786929, -2.50666484, 0.34514535, -1.90285395, -0.55366407,
0.30043979, 0.53858318, -0.34636673, 0.367585 , 0.92303558,
-2.5725778 , -5.77825751, 0.19560824, -0.30716804, 0.36148585,
0.08411846, -1.15535184, -1.40903272, -3.8157232 , 1.35141026,
-1.92965604, -0.4543164 , 0.31277516, 1.59976011, 1.28631915,
0.58874961, 0.71988977, 0.18769236, -0.93240286, 0.26887513,
0.02677913, 2.24863483, 0.62536881, 1.13525363, -0.0989156 ,
1.9383021 , 1.89213638, 2.09354319, 3.82488629, -0.407648 ])]
weights_ridge_df = pd.DataFrame(coefs, columns=feature_names)
color = {'whiskers': 'black', 'medians': 'green', 'caps': 'blue'}
weights_ridge_df.plot.box(color=color, vert=False, figsize=(12, 12))
plt.title('Ridge regression coefficients')
plt.grid()
plt.show()
Comparing Performance on test set¶
1. Baseline Model¶
baseline_model_median = DummyRegressor(strategy='median')
baseline_model_median.fit(train_features, train_labels)
mean_absolute_percentage_error(test_labels, baseline_model_median.predict(test_features))
0.5348927548151625
2. Linear Regression with normal equation¶
mean_absolute_percentage_error(test_labels ,lin_reg_cv_results['estimator'][0].predict(test_features))
0.32120472175482895
3. SGD regression with randomsearchCV¶
mean_absolute_percentage_error(test_labels ,poly_sgd_random_search_cv.best_estimator_.predict(test_features))
0.31523619087395877
4. Polynomial Regression¶
poly_reg_pipeline.fit(com_train_features ,com_train_labels)
mean_absolute_percentage_error(test_labels ,poly_reg_pipeline.predict(test_features))
0.28199759082657094
5. Lasso Regression¶
mean_absolute_percentage_error(test_labels ,lasso_grid_search.best_estimator_.predict(test_features))
0.280749692638101
6. Ridge Regression¶
mean_absolute_percentage_error(test_labels ,ridge_grid_search.best_estimator_.predict(test_features))
0.2711033645132435
Introduction¶
Over the past four weeks we explored various data preprocessing techniques and solved some regression problems using linear and logistic regression models. The other side of the supervised learning paradigm is classification problems.
To solve such problems we are going to consider image classification as a running example and solving it using Perceptron() method.
Imports¶
For classification problems, we need to import classes and utilities from sklearn.linear_model.
This module has implementations for different classification models like
Perceptron, LogisticRegression, svmandknn.We also need to import a bunch of model selection utilities from
sklearn.model_selectionmodule and metrics fromsklearn.metricsmodule.The data preprocessing utilities are imported from
sklearn.preprocessingmodules.
# Common imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns ;sns.set()
import os
import io
import warnings
# sklearn specific imports
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import hinge_loss
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV
from pprint import pprint
Handwritten Digit Classification¶
We are going to use Perceptron Classifier to classify (recognize) given digit images.
Since a single perceptron could only be used for binary classification, we consider only two classes in the first half. Eventually we will extend it to a multi-class setting.
Suppose we want to recognize whether the given image is of digit zero or not (digit other than zero). Then the problem could be cast as a binary classification problem.
The first step is to create a dataset that contains a collection of digit images (also called examples, samples) written by humans. Then each image should be labelled properly.
Data Loading¶
# returns Data and Label as a pandas dataframe
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
The data matrix $X$ and the respective label vector $ y$ need to be converted to the numpy array by calling a to_numpy method.
X = X.to_numpy()
y = y.to_numpy()
Let's get some information like number of features, number of classes about the dataset.
Observe that the labels are of string data type not integers.
target_names = np.unique(y)
print('Number of samples : {0}, type : {1}'.format(X.shape[0], X.dtype))
print('Number of features : {0}'.format(X.shape[1]))
Number of samples : 70000, type : float64 Number of features : 784
print('Minimum : ', np.min(X))
print('Maximum : ', np.max(X))
Minimum : 0.0 Maximum : 255.0
print('Number of classes : {0},type :{1}'.format(len(target_names), y.dtype))
print('Labels : {0}'.format(target_names))
Number of classes : 10,type :object Labels : ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
The MNIST dataset is clean and the range of values that each feature can take is also known. Therefore, the samples in the dataset may not require many data preprocessing techniques.
However, it is often better to scale the range of features between 0 to 1.
So, we can either use
MinMaxScalerorMaxAbsScaler. They don't make any difference as the image pixels can takes only positive value from 0 to 255.
X = MinMaxScaler().fit_transform(X)
print('Minimum : ', np.min(X))
print('Maximum : ', np.max(X))
Minimum : 0.0 Maximum : 1.0
Data Visualization¶
Let us pick a few images (the images are already shuffled in the dataset) and display them with their respective labels.
As said above, the images are stacked as a row vector of size $ 1 \times 784$ and therefore must be reshaped to the matrix of size $ 28 \times 28$ to display them properly.
# Choose a square number
num_images = 9
factor = int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols=factor,figsize=(8,6))
# take "num_images" starting from the index "idx_offset"
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(X[index+j].reshape(28,28), cmap='gray')
ax[i,j].set_title('Label : {0}'.format(str(y[index+j])))
ax[i,j].set_axis_off()
If we closely observe, we can see that there are moderate variations in the appearance of digits (ex: digit 1).
These matrices also close to sparse (i.e, there are lots of zero / black pixels in the matrix than non-zero pixels)
Data Splitting¶
Now, we know the details such as number of samples, size of each sample, number of features (784), number of classes (targets) about the dataset.
So let's split the total number of samples into train and test set in the following ratio : 60000/10000 (i.e 60000 samples in the training set and 10000 samples in the testing set).
Since the samples in the data set are already randomly shuffled, we need not to shuffle it again. Therefore using
train_test_split()may be skipped.
Binary Classification : 0-Detector¶
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Handling Imbalanced Data¶
Before proceeding further, we need to check whether the dataset is balanced or imbalanced.
We can do it by plotting the distribution of samples in each classes.
plt.figure(figsize=(10, 4))
sns.histplot(data=np.int8(y_train), binwidth=0.45, bins=11)
plt.xticks(ticks=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.xlabel('Class')
plt.title('Distribution of Samples')
plt.show()
Modifying Labels¶
Let us start with a simple classification problem, i.e binary classification.
Since the original label vector contains 10 classes, we need to modify the number of classes to 2.
Therefore, the label 0 will be changed 1 and all the other labels (1-9) will be changed to -1.
We will name the label vectors as
y_train_0andy_test_0.
# initialize new variable names with all -1
y_train_0 = -1*np.ones(len(y_train))
y_test_0 = -1*np.ones(len(y_test))
# find indices of digit 0 image
indx_0 = np.where(y_train=='0')
# remember original labels are of type str not int, so use those indices to modify y_train_0 & y_test_0
y_train_0[indx_0] = 1
indx_0 = np.where(y_test =='0')
y_test_0[indx_0] = 1
Sanity check :¶
Let's display the elements of y_train and y_train_0 to verify whether the labels are properly modified.
# Choose a square number
num_images = 9
factor = int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols=factor,figsize=(8,6))
# take "num_images" starting from the index "idx_offset"
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(X[index+j].reshape(28,28), cmap='gray')
ax[i,j].set_title('Label : {0}'.format(str(y_train_0[index+j])))
ax[i,j].set_axis_off()
Model¶
Baseline Models¶
Let us quickly construct a baseline model with the following rule : (you are free to choose different rules)
Count number of samples per class.
The model always outputs the class which has highest number of samples.
Then calculate the accuracy of the baseline model.
num_pos = len(np.where(y_train_0 == 1)[0])
num_neg = len(np.where(y_train_0 == -1)[0])
print(num_pos)
print(num_neg)
5923 54077
base_clf = DummyClassifier(strategy='most_frequent')
base_clf.fit(X_train ,y_train_0)
print('Training accuracy : {0:.4f}'.format(base_clf.score(X_train, y_train_0)))
print('Testing accuracy : {0:.4f}'.format(base_clf.score(X_test,y_test_0)))
Training accuracy : 0.9013 Testing accuracy : 0.9020
Now the reason is obvious. The model would have predicted 54077 samples correctly just by outputing -1 for all the input samples.
Therefore, the accuracy will be simply : $ \frac{54077}{60000} = 0.90128 $
This is the reason why "accuracy" alone is not always a good measure!.
Perceptron Model¶
Quick recap of various components in the general settings:
1. Training data¶
consists of features & labels or $(\mathbf X,y)$
Here, $y$ is a discrete number from a finite set.
Features in this case are pixel values of an image.
2. Model :¶
\begin{align} h_w:y&=&\text g(\mathbf w^T \mathbf x) \\ &=&\text g(w_0+w_1x_1+\ldots + w_mx_m) \end{align}
where,
$\mathbf w$ is weight vector in $\mathbb{R}^{(m+1)}$ i.e. it has components : $\{w_0,w_1,\ldots,w_m\}$
g(z) is a non-linear activation function given by a sign function:
$$\text g(z)=\begin{cases} +1 ,\text {if} \ z \ge 0 \\ -1, \text {otherwise}(i.e. z \lt 0)\end{cases}$$
3. Loss function :¶
Let $ {\hat y}^{(i)} \in \{-1,+1\}$ be the prediction from perceptron and ${\hat y}^{(i)}$ be the actual label for $i-\text{th}$ example. $ \\ $
The error is :
$$\text e^{(i)}=\begin{cases} 0 , \ \ \text { if} \ \ {\hat y}^{(i)} = y^{(i)} \\ -\mathbf {w^Tx^{(i)}}y^{(i)}, \text { otherwise} (i.e. {\hat y}^{(i)} \ne y^{(i)})\end{cases}$$
This can be compactly written as:
\begin{equation} e^{(i)}=\max(-\mathbf{w^Tx^{(i)}}y^{(i)},0)=\max(-h_{\text w }(\mathbf x^{(i)})y^{(i)},0) \end{equation}
4. Optimization :¶
Perceptron learning algorithm :
Initialize $\mathbf {\text w}^{(0)}=0$
For each training example $(x^{(i)},y^{(i)})$
${\hat y}^{(i)}=\text{sign}(\mathbf {w^Tx}^{(i)})[\text {Calculates the output value}]$
$\mathbf w^{(t+1)} := \mathbf w^{(t)}+ \alpha (y^{(i)}-{\hat y}^{(i)})\mathbf x^{(i)}[\text{Updates the weights}] $
IMP : Linearly separable examples lead to convergence of the algorithm with zero training loss, else it oscillates.
Parameters of Perceptron Class¶
- Let's quickly take a look into the important parameters of the Perceptron()
class sklearn.linear_model.Perceptron (*,penalty=None, alpha = 0.0001, l1_ratio=0.15, fit_intercept = True, max_iter=1000,tol=0.001, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,class_weight=None, warm_start=False).
We need not to pay attention to all the arguments and their default values.
Internally, the API uses the perceptron loss (i.e. it calls Hinge(0,0), where 0.0 is a threshold) and uses SGD to update the weights.
The other way of deploying perceptron is to use the general
linear_model.SGDClassifierwithloss='perceptron'The above loss is termed as hard Hinge-loss (as scores pass through the sign function) and hence we can't use SGD.
Whereas, sklearn implements hinge-loss with the following definition: $\max (0,-wx^iy^i$) and by default calls SGD to minimize the loss.
Instantiation¶
Create an instantiation of binary classifier (bin_clf).
bin_clf = Perceptron(max_iter=100,random_state=1729)
Training and Prediction¶
Call the
fitmethod to train the model.It would be nice to plot the iteration vs loss curve for the training. However, sklearn does not have a direct function to plot it.
Nevertheless, we can workaround this using
partial_fitmethod (explained later)
bin_clf.fit(X_train, y_train_0)
print('Dimension of Weights : {0}'.format(bin_clf.coef_.shape))
print('Bias : {0}'.format(bin_clf.intercept_))
print('Loss function : {0}'.format(bin_clf.loss_function_))
Dimension of Weights : (1, 784) Bias : [-108.] Loss function : <sklearn.linear_model._sgd_fast.Hinge object at 0x0000016D2BEF3570>
Let us make predictions on the training set and then calculate the training accuracy.
y_hat_train_0 = bin_clf.predict(X_train)
print('Training Accuracy :', bin_clf.score(X_train,y_train_0))
Training Accuracy : 0.99095
Let us make the predictions on the test set and then calculate the testing accuracy.
print('Test accuracy :',bin_clf.score(X_test,y_test_0))
Test accuracy : 0.989
Displaying Predictions¶
Take few images from the test-set at random and display it with the corresponding predictions.
Plot a few images in a single figure window along with their respective Predictions.
y_hat_test_0 = bin_clf.predict(X_test)
num_images = 9
factor = int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols = factor, figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(X_test[index+j].reshape(28,28),cmap='gray')
ax[i,j].set_title('Prediction: {0}'.format(str(y_hat_test_0[index+j])))
ax[i,j].set_axis_off()
indx_0 = np.where(y_test_0==1)
zeroImgs = X_test[indx_0[0]]
zeroLabls = y_hat_test_0[indx_0[0]]
num_images = 9
factor = int(np.sqrt(num_images))
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(zeroImgs[index+j].reshape(28,28),cmap='gray')
ax[i,j].set_title('Prediction : {0}'.format(str(zeroLabls[index+j])))
ax[i,j].set_axis_off()
It seems that there are a significant number of images that are correctly classified.
num_misclassified = np.count_nonzero(zeroLabls == -1)
num_correctclassified = len(zeroLabls) - num_misclassified
accuracy = num_correctclassified / len(zeroLabls)
print(accuracy)
0.9193877551020408
This above score is less than the accuracy score of the model but it seems preety descent.
Will it be the same if we consider another digit, say, 5 for positive class and all other class as negative. Of course not.
Better Evaluation metrics¶
- We now know that using the accuracy alone to measure the performance of the model is not suitable (especially for imbalanced datasets).
1. Confusion Matrix¶
y_hat_train_0 = bin_clf.predict(X_train)
cm_display = ConfusionMatrixDisplay.from_predictions(
y_train_0, y_hat_train_0, values_format='.5g', display_labels=bin_clf.classes_)
plt.show()
Pay attention to the number of FPs and FNs. Suppose for some reasons, we want the classifer to avoid FPs to a good extent irrespective of FNs, how can we acheive it.
To answer it, let's compute the other metrics which take FPs and FNs into account.
2. Precision & Recall¶
We can use the function classification_report to compute these parameters.
However, for the time being let's compute these parameters using the data from the confusion matrix manually.
cf_matrix = cm_display.confusion_matrix
tn = cf_matrix[0,0]
fn = cf_matrix[1,0]
fp = cf_matrix[0,1]
tp = cf_matrix[1,1]
precision = tp/(tp+fp)
print('Precision : ', precision)
recall = tp/(tp+fn)
print('Recall : ', recall)
accuracy = (tn+tp)/(tn+tp+fn+fp)
print('Accuracy : ', accuracy)
Precision : 0.9783072546230441 Recall : 0.9289211548201924 Accuracy : 0.99095
Precision is close to 0.98. Despite it, we still want to increase the precision.
In general, we would like to know whether the model under consideration with the set hyper-parameters is a good one for a given problem.
Cross validation (CV)¶
Well to address this, we have to use cross-validation folds and measure the same metrics across these folds for different values of hyperparameters.
However, perceptron doesn't have many hyperparameters other than the learning rate.
For the moment, we set the learning rate to its default value. Later, we will use
GridSearchCVto find the better value for the learning rate.
Generalization
bin_clf = Perceptron(max_iter=100, random_state=1729)
scores = cross_validate(bin_clf, X_train, y_train_0, cv=5, scoring=[
'precision', 'recall', 'f1'], return_estimator=True)
print(scores)
{'fit_time': array([0.80022049, 1.47012854, 1.06001902, 0.94003081, 1.28021407]), 'score_time': array([0.02770138, 0.02423024, 0.02049136, 0.02981305, 0.02924967]), 'estimator': [Perceptron(max_iter=100, random_state=1729), Perceptron(max_iter=100, random_state=1729), Perceptron(max_iter=100, random_state=1729), Perceptron(max_iter=100, random_state=1729), Perceptron(max_iter=100, random_state=1729)], 'test_precision': array([0.95890411, 0.98828125, 0.95319149, 0.95090439, 0.96200345]), 'test_recall': array([0.94594595, 0.85472973, 0.94514768, 0.93164557, 0.94008439]), 'test_f1': array([0.95238095, 0.91666667, 0.94915254, 0.94117647, 0.95091763])}
NOTE :
The perceptron estimator passed as an argument to the function cross_validate is internally cloned num_fold (cv=5) times and fitted independently on each fold. (you can check this by setting warm_start=True)
Compute the average and standard deviation of scores for all three metrics on (k=5) folds to measure the generalization!.
print('Precision : avg : {0:.2f}, std : {1:.2f}'.format(
scores['test_precision'].mean(), scores['test_precision'].std()))
print()
print('Recall : avg : {0:.2f}, std : {1:.2f}'.format(
scores['test_recall'].mean(), scores['test_recall'].std()))
print()
print('F1 score : avg : {0:.2f}, std : {1:.3f}'.format(
scores['test_f1'].mean(), scores['test_f1'].std()))
Precision : avg : 0.96, std : 0.01 Recall : avg : 0.92, std : 0.03 F1 score : avg : 0.94, std : 0.013
Let us pick the first estimator returned by the cross-validate function.
So, we can hope that the model might also perform well on test data.
bin_clf = scores['estimator'][0]
y_hat_test_0 = bin_clf.predict(X_test)
cm_display = ConfusionMatrixDisplay.from_predictions(
y_test_0, y_hat_test_0, values_format='.5g')
print('Precision : {0:.4f}'.format(precision_score(y_test_0, y_hat_test_0)))
print('Recall : {0:.4f}'.format(recall_score(y_test_0, y_hat_test_0)))
Precision : 0.9547 Recall : 0.9469
This is good !
Another way for 'Generalization' (Optional)
There is an another approach of getting predicted labels via cross-validation and using it to measure the generalization.
In this case, each sample in the dataset will be part of only one test set in the splitted folds.
y_hat_train_0 = cross_val_predict(bin_clf, X_train, y_train_0, cv=5)
cm_display = ConfusionMatrixDisplay.from_predictions(
y_train_0, y_hat_train_0, values_format='.5g')
plt.show()
cf_matrix = cm_display.confusion_matrix
tn = cf_matrix[0,0]
fn = cf_matrix[1,0]
fp = cf_matrix[0,1]
tp = cf_matrix[1,1]
precision = tp/(tp+fp)
print('Precision : ', precision)
recall = tp/(tp+fn)
print('Recall : ', recall)
accuracy = (tn+tp)/(tn+tp+fn+fp)
print('Accuracy : ', accuracy)
Precision : 0.9620119591980303 Recall : 0.9235184872530812 Accuracy : 0.98885
Compare the precision and recall score obtained by the above method with that of the previous method (i.e. using cross_validate)
print('Precision : {0:.4f}'.format(precision_score(y_train_0,y_hat_train_0)))
print('Recall : {0:.4f}'.format(recall_score(y_train_0,y_hat_train_0)))
Precision : 0.9620 Recall : 0.9235
Finally, we can print all these scores as a report using the classification_report function
print(classification_report(y_train_0,y_hat_train_0))
precision recall f1-score support
-1.0 0.99 1.00 0.99 54077
1.0 0.96 0.92 0.94 5923
accuracy 0.99 60000
macro avg 0.98 0.96 0.97 60000
weighted avg 0.99 0.99 0.99 60000
3. Precision / Recall Tradeoff¶
Often time we need to make a trade off between precision and recall scores of a model.
It depends on the problem at hand.
It is important to note that we should not pass the predicted labels as input to
precision_recall_curvefunction, instead we need to pass the probability scores or the output from the decision function!.The
Perceptron()class contains adecision_functionmethod, therefore we can make use of it.Then, internally the decision scores are sorted, tps and fps will be computed by changing the threshold from index[0] to index [-1].
Let us compute the scores from decision function.
bin_clf = Perceptron(random_state=1729)
bin_clf.fit(X_train, y_train_0)
y_scores = bin_clf.decision_function(X_train)
sns.histplot(np.sort(y_scores))
plt.show()
The reason for so many negative values than the positives is : Class-Imbalance.
Suppose threshold takes the value of -600, then all the samples having score greater than -600 is set to 1 ( +ve label ) and less than it is set to -1 ( -ve label ).
Therefore, the number of False Positives will be increased. This will in turn reduce the precision score to a greater extent.
On the otherhand, if the threshold takes the value of say 400, Then, the number of False negatives will be increase and hence the recall will reduce to a greater extent.
precisions, recalls, thresholds = precision_recall_curve(y_train_0,y_scores,pos_label=1)
plt.figure(figsize=(10, 6))
plt.plot(precisions[:-1], recalls[:-1], "g--")
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 4))
plt.plot(thresholds, precisions[:-1], "b--", label='Precision')
plt.plot(thresholds, recalls[:-1], "g-", label='Recall')
plt.xlabel('Threshold')
plt.grid(True)
plt.legend(loc='best')
plt.show()
Getting the index of threshold around zero
idx_th = np.where(np.logical_and(thresholds >0, thresholds <1))
print('Precision for zero threshold : ',precisions[idx_th[0][0]])
Precision for zero threshold : 0.9783072546230441
- The solution to the question of how can we increase the precision of the classifier by compromising the recall is we can make use of the above plot.
4. ROC Curve¶
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_0, y_scores)
plt.figure(figsize=(10, 4))
plt.plot(fpr, tpr, linewidth=2, label='Perceptron')
plt.plot([0, 1], [0, 1], 'k--', label='baseEstimator')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()
Warm Start VS Cold Start¶
Cold Start¶
If we execute the
fitmethod ofbin_clfrepeatedly, we get the same score for both training and testing accuracy.This because everytime the
fitmethod is called, the model weights are initialized to the same values. Therefore, we obtain the same score.This is termed as cold start.
Let's execute the following cell 4 times and observe the score.
bin_clf.fit(X_train, y_train_0)
y_hat_train_0 = bin_clf.predict(X_train)
print('Training Accuracy : ', bin_clf.score(X_train, y_train_0))
print('Test accuracy : ', bin_clf.score(X_test, y_test_0))
Training Accuracy : 0.99095 Test accuracy : 0.989
Warm Start¶
Setting
warm_start=Trueretains the weight values of the model aftermax_iterand hence produce different results for each execution.Warm starting is useful in many ways. It helps us train the model by initializing the weight values from the previous state. So, we can pause the training and resume it whenever we get the resource for computation.
Of course, it is not required for simple models like perceptron and for a small dataset like MNIST.
In this notebook, we use this feature to plot the iteratation vs loss curve.
Let us execute the following lines of code 4 times and observe how the training accuracy changes for each execution.
bin_clf_warm = Perceptron(max_iter=100,random_state=1729,warm_start=True)
bin_clf_warm.fit(X_train,y_train_0)
print('Training Accuracy : ', bin_clf_warm.score(X_train,y_train_0))
Training Accuracy : 0.99095
Multiclass Classification (OneVsAll)¶
We know that the perceptron is a binary classifier. However, MNIST dataset contains 10 classes. So, we need to extend the idea to handle multi-class problem.
Solution : Combining multiple binary classifiers and devise a suitable scoring metric.
Sklearn makes it extremely easy without modifying a single line of code that we have written for the binary classifier.
Sklearn does this by counting a number of unique elements (10 in this case) in the label vector
y_trainand converting labels usingLabelbinarizerto fit each binary classifier.
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import LabelBinarizer
Let's use Label binarizer just to see the encoding.
y_train_ovr = LabelBinarizer().fit_transform(y_train)
for i in range(10):
print('{0} : {1}'.format(y_train[i],y_train_ovr[i]))
5 : [0 0 0 0 0 1 0 0 0 0] 0 : [1 0 0 0 0 0 0 0 0 0] 4 : [0 0 0 0 1 0 0 0 0 0] 1 : [0 1 0 0 0 0 0 0 0 0] 9 : [0 0 0 0 0 0 0 0 0 1] 2 : [0 0 1 0 0 0 0 0 0 0] 1 : [0 1 0 0 0 0 0 0 0 0] 3 : [0 0 0 1 0 0 0 0 0 0] 1 : [0 1 0 0 0 0 0 0 0 0] 4 : [0 0 0 0 1 0 0 0 0 0]
The
y_train_ovrwill be of size of size $60000 \times 10$.The first column will be (binary) label vector for 0-detector and the next one for 1-Detector and so on.
clf = Perceptron(random_state=1729)
clf.fit(X_train,y_train)
Perceptron(random_state=1729)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Perceptron(random_state=1729)
What had actually happened internally was that the API automatically created 10 binary classifiers, converted labels to binary sparse matrix and trained them with the binarized labels.
During the inference time, the input will be passed through all these 10 classifiers and the highest score among the output from the classifiers will be considered as the predicted class.
To see it in action, let us execute the following lines of code.
print('Shape of Weight matrix : {0} and bias vector : {1}'.format(
clf.coef_.shape, clf.intercept_.shape))
Shape of Weight matrix : (10, 784) and bias vector : (10,)
So it is a matrix of size $ 10 \times 784 $, where each row represents the weights for a single binary classifier.
Important difference to note is that there is no signum function associated with the perceptron.
The class of a perceptron that outputs the maximum score for the input sample is considered as the predicted class.
for i in range(10):
scores = clf.decision_function(X_train[i].reshape(1, -1))
print(scores)
print()
print('The predicted class : ', np.argmax(scores))
print()
print('Predicted output : ')
print(clf.predict(X_train[i].reshape(1, -1)))
print('-'*20)
[[-281.59403306 -316.86757401 -163.89196463 -29.53201077 -360.15901576
35.74422145 -281.75632449 -129.84598231 -269.37986928 -232.78886582]]
The predicted class : 5
Predicted output :
['5']
--------------------
[[ 163.56542868 -383.99440215 -217.57545559 -208.67775471 -341.63012687
-200.703391 -272.04836601 -265.31963091 -205.48413687 -254.29404075]]
The predicted class : 0
Predicted output :
['0']
--------------------
[[-333.95381776 -222.07201845 -112.63298731 -31.12224529 41.19340254
-231.70143791 -294.13151865 -140.55028066 -336.50911188 -242.68656671]]
The predicted class : 4
Predicted output :
['4']
--------------------
[[-246.03221838 87.48547482 -63.78459054 -88.46945021 -171.87637063
-160.67975394 -243.45420992 -158.6293887 -130.2792772 -255.97957709]]
The predicted class : 1
Predicted output :
['1']
--------------------
[[-361.45457901 -208.39195694 -322.6535025 -227.38734333 -109.59223376
-165.16824298 -231.39936947 -134.96336794 -141.13650135 -4.26698962]]
The predicted class : 9
Predicted output :
['9']
--------------------
[[-213.98914264 -332.31924644 88.32167628 -160.22043829 -331.30016148
-184.67700115 -317.99512495 -345.58203768 -166.98514418 -90.70019223]]
The predicted class : 2
Predicted output :
['2']
--------------------
[[-631.88492118 154.366213 -65.23234141 -90.95338716 -189.29568627
-137.14385236 -99.64604383 -159.09859285 -136.86391388 -199.26120723]]
The predicted class : 1
Predicted output :
['1']
--------------------
[[-266.22125336 -348.16362937 -122.84601307 86.8302499 -285.90316032
-160.36818147 -564.14554402 -338.4272203 -67.1555248 -154.20672049]]
The predicted class : 3
Predicted output :
['3']
--------------------
[[-390.53802384 114.17222607 -95.74829681 -80.93382545 -200.32890427
-57.58045367 -115.86926567 -86.34978854 -102.65239523 -89.46537486]]
The predicted class : 1
Predicted output :
['1']
--------------------
[[-259.52152249 -252.03760092 -142.07346405 -229.30997309 172.14757401
-129.09194925 -92.13214917 -139.62166859 -179.71680123 -124.89564014]]
The predicted class : 4
Predicted output :
['4']
--------------------
Get the prediction for all training samples.
y_hat = clf.predict(X_train)
Lets display the classification report.
print(classification_report(y_train,y_hat))
precision recall f1-score support
0 0.98 0.95 0.97 5923
1 0.94 0.98 0.96 6742
2 0.89 0.90 0.90 5958
3 0.86 0.87 0.87 6131
4 0.89 0.94 0.91 5842
5 0.81 0.88 0.85 5421
6 0.92 0.97 0.94 5918
7 0.91 0.94 0.92 6265
8 0.92 0.77 0.84 5851
9 0.92 0.82 0.87 5949
accuracy 0.90 60000
macro avg 0.90 0.90 0.90 60000
weighted avg 0.91 0.90 0.90 60000
Now let us display the confusion matrix and relate it with the report above.
cm_display = ConfusionMatrixDisplay.from_predictions(
y_train, y_hat, values_format='.5g')
Making a Pipeline¶
Let's create a pipeline to keep the code compact.
Recall that, the MNIST dataset is clean and hence doesn't require much preprocessing.
The one potential preprocessing technique we may use is to scale the features within the range(0,1).
It is not similar to scaling down the range values between 0 and 1.
# create a list with named tuples
estimators = [('scaler', MinMaxScaler()), ('bin_clf', Perceptron())]
pipe = Pipeline(estimators)
pipe.fit(X_train,y_train_0)
Pipeline(steps=[('scaler', MinMaxScaler()), ('bin_clf', Perceptron())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()), ('bin_clf', Perceptron())])MinMaxScaler()
Perceptron()
y_hat_train_0 = pipe.predict(X_train)
cm_display = ConfusionMatrixDisplay.from_predictions(
y_train_0, y_hat_train_0, values_format='.5g')
plt.show()
Iteration vs Loss Curve¶
The other way of plotting Iteration Vs Loss Curve with the Partial_fit method.
iter = 100
bin_clf1 = Perceptron(max_iter=100,random_state=2094)
loss_clf1=[]
for i in range(iter):
bin_clf1.partial_fit(X_train,y_train_0,classes=np.array([1,-1]))
y_hat_0 = bin_clf1.decision_function(X_train)
loss_clf1.append(hinge_loss(y_train_0,y_hat_0))
plt.figure()
plt.plot(np.arange(iter), loss_clf1)
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.show()
GridSearchCV¶
So, far we didn't perform any hyperparameter tuning & just accepted the default value for learning rate of the Perceptron class.
Now, let us search for a better learning rate using
GridSearchCV.No matter what the learning rate is, the loss will never converge to zero as the classes are not linearly separable.
from sklearn.metrics import make_scorer
scoring = make_scorer(hinge_loss,greater_is_better=False)
lr_grid = [1/2**n for n in range(1,6)]
bin_clf_gscv = GridSearchCV(Perceptron(), param_grid={'eta0':lr_grid},scoring=scoring, cv=5)
bin_clf_gscv.fit(X_train,y_train_0)
GridSearchCV(cv=5, estimator=Perceptron(),
param_grid={'eta0': [0.5, 0.25, 0.125, 0.0625, 0.03125]},
scoring=make_scorer(hinge_loss, greater_is_better=False))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=Perceptron(),
param_grid={'eta0': [0.5, 0.25, 0.125, 0.0625, 0.03125]},
scoring=make_scorer(hinge_loss, greater_is_better=False))Perceptron()
Perceptron()
bin_clf_gscv.cv_results_
{'mean_fit_time': array([0.89319444, 0.88571563, 0.85104909, 0.73002439, 0.69517179]),
'std_fit_time': array([0.19984267, 0.19527222, 0.17349208, 0.09260067, 0.04365529]),
'mean_score_time': array([0.01677132, 0.01518822, 0.01427217, 0.01503611, 0.01647668]),
'std_score_time': array([0.00137195, 0.0027181 , 0.00307773, 0.003333 , 0.00319911]),
'param_eta0': masked_array(data=[0.5, 0.25, 0.125, 0.0625, 0.03125],
mask=[False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'eta0': 0.5},
{'eta0': 0.25},
{'eta0': 0.125},
{'eta0': 0.0625},
{'eta0': 0.03125}],
'split0_test_score': array([-0.02166667, -0.02166667, -0.02166667, -0.02166667, -0.02166667]),
'split1_test_score': array([-0.0395, -0.0395, -0.0395, -0.0395, -0.0395]),
'split2_test_score': array([-0.02816667, -0.02816667, -0.02816667, -0.02816667, -0.02816667]),
'split3_test_score': array([-0.023 , -0.023 , -0.023 , -0.04416667, -0.04416667]),
'split4_test_score': array([-0.03016667, -0.03016667, -0.01983333, -0.01983333, -0.077 ]),
'mean_test_score': array([-0.0285 , -0.0285 , -0.02643333, -0.03066667, -0.0421 ]),
'std_test_score': array([0.00633772, 0.00633772, 0.00709663, 0.0096425 , 0.01918697]),
'rank_test_score': array([2, 2, 1, 4, 5])}
Well, instead of instantiating a Perceptron class with a new learning rate and re-train the model, we could simply get the best_estimator from GridSearchCV as follows.
best_bin_clf = bin_clf_gscv.best_estimator_
best_bin_clf
Perceptron(eta0=0.125)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Perceptron(eta0=0.125)
We can observe that the best learning rate is 0.125.
iter = 100
loss = []
best_bin_clf = Perceptron(max_iter=1000,random_state=2094,eta0=0.125)
for i in range(iter):
best_bin_clf.partial_fit(X_train, y_train_0, classes=np.array([1,-1]))
y_hat_0 = best_bin_clf.decision_function(X_train)
loss.append(hinge_loss(y_train_0,y_hat_0))
plt.figure()
plt.plot(np.arange(iter), loss_clf1, label='eta0=1')
plt.plot(np.arange(iter), loss, label='eta0=0.125')
plt.grid(True)
plt.legend()
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.show()
y_hat_train_0 = bin_clf.predict(X_train)
print(classification_report(y_train_0, y_hat_train_0))
precision recall f1-score support
-1.0 0.99 1.00 0.99 54077
1.0 0.98 0.93 0.95 5923
accuracy 0.99 60000
macro avg 0.99 0.96 0.97 60000
weighted avg 0.99 0.99 0.99 60000
Now, compare this classification report with the one when eta0 = 1
Visualizing weight vectors (Optional)¶
It will be interesting to look into the samples which are misclassified as False Positives (that is, images that are not zero but classified as zero).
# repeating the code for readability
bin_clf = Perceptron(max_iter=100)
bin_clf.fit(X_train, y_train_0)
y_hat_train_0 = bin_clf.predict(X_train)
# index of true -ve samples
idx_n = np.where(y_train_0==-1)
# index of predicted positive samples
idx_pred_p = np.where(y_hat_train_0==1)
# index of predicted negative samples
idx_pred_n = np.where(y_hat_train_0==-1)
idx_fp = np.intersect1d(idx_n, idx_pred_p)
idx_tn = np.intersect1d(idx_n,idx_pred_p)
fig, ax = plt.subplots(nrows=factor, ncols=factor, figsize=(8,6))
idx_offset = 0
for i in range(3):
index = idx_offset + i
for j in range(3):
ax[i,j].imshow(X_train[idx_fp[index+j]].reshape(28,28),cmap='gray')
# we should not use x_train_with_dummy
# GT : ground truth ; Pred : predicted
ax[i,j].set_title('GT : {0}, Pred : {1}'.format(str(y_train_0[idx_fp[index+j]]),str(y_hat_train_0[idx_fp[index+j]])))
ax[i,j].set_axis_off()
from matplotlib.colors import Normalize
w = bin_clf.coef_
w_matrix = w.reshape(28, 28)
#fig = plt.figure()
#plt.imshow(w_matrix, cmap='magma')
#plt.imshow(w_matrix, cmap='cividis')
#plt.imshow(w_matrix, cmap='viridis')
#plt.imshow(w_matrix, cmap='gray')
plt.imshow(w_matrix, cmap='inferno')
#plt.axis(False)
plt.rcParams['axes.grid'] = False
plt.colorbar()
plt.show()
#print(idx_fp.shape)
activation = w * X_train[idx_fp[0]].reshape(1, -1)
lin_out = activation.reshape(28, 28)
plt.subplot(1, 2, 1)
plt.imshow(X_train[idx_fp[0]].reshape(28, 28), cmap='gray')
plt.colorbar()
#lin_out[lin_out < 0]=0 # just set the value less than zero to zero
plt.subplot(1, 2, 2)
plt.imshow(lin_out, cmap='gray')
plt.colorbar()
plt.grid(False)
plt.axis(False)
plt.show()
Input to the signum
print(np.sum(lin_out)+bin_clf.intercept_)
[352.4343714]
activation = w*(X_train[idx_tn[0]].reshape(1, -1))
lin_out = activation.reshape(28, 28)
plt.subplot(1, 2, 1)
plt.imshow(X_train[idx_tn[0]].reshape(28, 28), cmap='gray')
plt.colorbar()
# just set the value less than zero to zero
lin_out[lin_out < 0] = 0
plt.subplot(1, 2, 2)
plt.imshow(lin_out, cmap='gray')
plt.colorbar()
plt.grid(False)
plt.axis(False)
plt.show()
Input to the signum
print(np.sum(lin_out) + bin_clf.intercept_)
[352.4343714]
Objective¶
In this notebook we will solve the same problem of recognizing Handwritten digits using Logistic regression model.
Imports¶
# Common imports
import numpy as np
from pprint import pprint
# to make this notebook's output stable across runs
np.random.seed(42)
# sklearn specific imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict
# log loss is also known as cross entropy loss
from sklearn.metrics import log_loss
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score
#scipy
from scipy.stats import loguniform
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# global settings
mpl.rc('axes',labelsize=14)
mpl.rc('xtick',labelsize=12)
mpl.rc('ytick',labelsize=12)
mpl.rc('figure',figsize=(8,6))
# Ignore all warnings (convergence..) by sklearn
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
Handwritten Digit Classification¶
We are going to use LogisticRegression (despite it's name) to classify a given digit image. Again, we first apply the model for binary classification and then extend it to multiclass classification.
Suppose we want to recognize whether the given image is of digit zero or not (digits other than zero). Then the problem could be case as binary classification problem.
The first step is to create a dataset that contains collection of digit images (also called examples, samples) written by humans. Then each image should be labelled properly.
Fortunately, we have a standard benchmark dataset called MNIST.
from sklearn.datasets import fetch_openml
# it returns the data and labels as a panda dataframe.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
The data matrix $X$ and the respective label vector $y$ need to be converted to numpy array by calling a to_numpy method.
X = X.to_numpy()
y = y.to_numpy()
Preprocessing¶
Unlike perceptron, where scaling the range is optional(but recommended), sigmoid requires range between 0 to 1.
Contemplate the consequence if we don't apply the scaling operation on the input datapoints.
NOTE : Do not apply mean centering as it removes zeros from the data, however zeros should be zeros in the dataset.
Since we are using only one preprocessing step, using
pipelinemay not be required.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
print('Mean of the features : ', np.mean(X))
print('Standard Deviation : ', np.std(X))
print('Minimum value : ', np.min(X))
print('Maximum value : ', np.max(X))
Mean of the features : 0.13092720382627604 Standard Deviation : 0.3084510570135976 Minimum value : 0.0 Maximum value : 1.0
Let's get some information about the dataset.
print('Number of targets : {0} ,type : {1}'.format(X.shape[0] ,X.dtype))
print('Number of features : {0}'.format(X.shape[1]))
print()
print('Number of classes : {0} ,type : {1}'.format(len(np.unique(y)) ,y.dtype))
print('Labels : {0}'.format(np.unique(y)))
Number of targets : 70000 ,type : float64 Number of features : 784 Number of classes : 10 ,type : object Labels : ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
Note that the labels are of string data type.
Data visualization¶
num_images = 9
factor = int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols = factor, figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(X[index+j].reshape(28,28),cmap='gray')
ax[i,j].set_title('Label : {0}'.format(str(y[index+j])))
ax[i,j].set_axis_off()
Data splitting¶
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Before proceeding further, we need to check whether the datasset is balanced or imbalanced.
We can do it by plotting the distribution of samples in each classes.
plt.figure(figsize=(10,5))
sns.histplot(data=np.int8(y_train) ,binwidth=0.45 ,bins=11)
plt.xticks(ticks=[0,1,2,3,4,5,6,7,8,9] ,label=[0,1,2,3,4,5,6,7,8,9])
plt.xlabel('Class')
plt.title('Distribution of samples')
plt.show()
Binary Classification : 0 - Detector¶
Let us start with a simple classification problem, that is, binary classification.
Since the original label vector contains 10 classes, we need to modify the number of classes to 2. Therefore, the label '0' will be changed to '1' and all other labels(1-9) will be changed to '0'
NOTE: For perceptron we set the negative labels to -1
# initialize new variables names with all 0.
y_train_0 = np.zeros((len(y_train)))
y_test_0 = np.zeros((len(y_test)))
# find indices of digit 0 image
# remember original labels are of type str not int
indx_0 = np.where(y_train=='0')
# use those indices to modify y_train_0 & y_test_0
y_train_0[indx_0] = 1
indx_0 = np.where(y_test == '0')
y_test_0[indx_0] = 1
Visualization of new variables¶
num_images = 9
factor = np.int(np.sqrt(num_images))
fig, ax = plt.subplots(nrows=factor, ncols=factor,figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset+ i*(factor)
for j in range(factor):
ax[i,j].imshow(X[index+j].reshape(28,28), cmap='gray')
ax[i,j].set_title('Label : {0}'.format(str(y_train_0[index+j])))
ax[i,j].set_axis_off()
Model¶
Baseline Models¶
Let us quickly construct a baseline model with the following rule :
Count number of samples per class.
The model always output the class which has highest number of samples.
Then calculate the accuracy of the baseline model.
num_pos = len(np.where(y_train_0 == 1)[0])
num_neg = len(np.where(y_train_0 == 0)[0])
print(num_pos)
print(num_neg)
5923 54077
base_clf = DummyClassifier(strategy='most_frequent')
base_clf.fit(X_train,y_train_0)
print(base_clf.score(X_train,y_train_0))
0.9012833333333333
Now the reason is obvious. The model would have predicted 54077 samples correctly just by outputing 0 for all the input samples.
Therefore the accuracy will be $\frac{54077}{60000} = 90.12 \%$
Logistic Regression model¶
Quick recap of various components in the general settings:
Training data :
consists of features & labels or $(\mathbf X,y)$
Here, $y$ is a discrete number from a finite set.
Features in this case are pixel values of an image.
Model : $$ z = w_0x_0 + w_1x_1+ \ldots + w_mx_m$$
$$ = \mathbf w^{T} \mathbf x$$
and passing it through the sigmoid non-linear function (or Logistic function)
$$ \sigma(z)=\frac{1}{1+e^{-z}}$$
- Loss function:
\begin{equation} J(\mathbf w) = -\frac{1}{n} \mathbf \sum [y^{(i)} \log(h_w(\mathbf x^{(i)}))+(1-y^{(i)})(1-\log(h_w(\mathbf x^{(i)})))] \end{equation}
- Optimization:
Let's quickly take a look into the important parameters of the SGDClassifier() estimator:
class sklearn.linear_model.SGDClassifier (loss='hinge', * ,penalty='l2', alpha=0.0001, l1_ratio = 0.15, fit_intercept =True, max_iter =1000, tol=0.001, shuffle=True, verbose =0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate = 'optimal', eta0=0.0, power_t = 0.5, early_stopping = False, validation_fraction =0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False).
IMPORTANT : Setting the loss parameter to
loss=logmakes it a logistic regression classifier. We may refer to documentation for more details on theSGDClassifierclass.Create an instant of binary classifier (bin_sgd_clf) and call the
fitmethod to train the model.Let's use fit method of
SGDClassifier()to plot the iteration vs loss curve. (also we could usepartial_fit()method )Therefore, to capture the loss for each iterations during training we set the parameters
warm_start =Trueandmax_iter=1
Training without regularization¶
Set eta0 = 0.01,learning_rate = 'constant' and alpha = 0.
bin_sgd_clf = SGDClassifier(loss='log',
penalty='l2',
warm_start=True,
eta0=0.01,
alpha=0,
learning_rate='constant',
random_state=1729)
loss = []
iter = 100
for i in range(iter):
bin_sgd_clf.fit(X_train, y_train_0)
y_pred = bin_sgd_clf.predict_proba(X_train)
loss.append(log_loss(y_train_0, y_pred))
plt.figure()
plt.plot(np.arange(iter), loss)
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('Label')
plt.show()
Let us calculate the training and testing accuracy of the model.
print('Training accuracy : {0:.4f}'.format(bin_sgd_clf.score(X_train,y_train_0)))
print('Testing accuracy : {0:.4f}'.format(bin_sgd_clf.score(X_test,y_test_0)))
Training accuracy : 0.9943 Testing accuracy : 0.9917
We know that accuracy alone is not a good metric for binary classification.
So let's compute Precision, recall and f1-score for the model.
y_hat_train_0 = bin_sgd_clf.predict(X_train)
cm_display = ConfusionMatrixDisplay.from_predictions(
y_train_0, y_hat_train_0, values_format='.5g')
plt.show()
print(classification_report(y_train_0,y_hat_train_0))
precision recall f1-score support
0.0 1.00 1.00 1.00 54077
1.0 0.97 0.97 0.97 5923
accuracy 0.99 60000
macro avg 0.99 0.98 0.98 60000
weighted avg 0.99 0.99 0.99 60000
Cross Validation¶
estimator = SGDClassifier(loss='log',
penalty='l2',
max_iter=100,
warm_start=False,
eta0=0.01,
alpha=0,
learning_rate='constant',
random_state=1729)
cv_bin_clf = cross_validate(estimator, X_train, y_train_0, cv=5,
scoring=['precision', 'recall', 'f1'],
return_train_score=True,
return_estimator=True)
cv_bin_clf
{'fit_time': array([1.04430604, 1.030128 , 1.03978181, 1.12593508, 0.95098424]),
'score_time': array([0.0279882 , 0.0270009 , 0.02554798, 0.02606583, 0.02599382]),
'estimator': [SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=100, random_state=1729),
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=100, random_state=1729),
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=100, random_state=1729),
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=100, random_state=1729),
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=100, random_state=1729)],
'test_precision': array([0.95538721, 0.96382429, 0.97238999, 0.96735395, 0.95952782]),
'train_precision': array([0.97419355, 0.97321046, 0.96473818, 0.97701149, 0.96404399]),
'test_recall': array([0.95861486, 0.94510135, 0.95105485, 0.95021097, 0.96033755]),
'train_recall': array([0.95589787, 0.95821903, 0.96433094, 0.95082313, 0.96200929]),
'test_f1': array([0.95699831, 0.954371 , 0.9616041 , 0.95870583, 0.95993252]),
'train_f1': array([0.96495899, 0.96565657, 0.96453452, 0.96373944, 0.96302557])}
From the above result, we can see that logistic regression is better than the perceptron.
However, it is good to check the weight values of all the features and decide whether regularization could be of any help.
weights = bin_sgd_clf.coef_
bias = bin_sgd_clf.intercept_
print('Bias :', bias)
print('Shape of weights :', weights.shape)
print('Shape of bias :', bias.shape)
Bias : [-5.01034596] Shape of weights : (1, 784) Shape of bias : (1,)
plt.figure()
plt.imshow(weights.reshape(28, 28), cmap='inferno')
plt.grid(False)
plt.colorbar()
plt.show()
plt.figure()
plt.plot(np.arange(0,784),weights[0,:])
plt.ylim(np.min(weights[0])-5,np.max(weights[0])+5)
plt.grid(True)
plt.xlabel('Feature Index')
plt.ylabel('Weight value')
plt.show()
It is interesting to observe that how many weight values are exactly zero.
Those features contribute nothing in the classification.
zero_weight_idx = np.where(weights[0]==0)
print(len(zero_weight_idx[0]))
# num_zero_w = weights.shape[1]-np.count_nonzero(weights)
# print("Number of weights with value zero".format(num_zero_w))
67
From the above plot, it is also obvious that regularization is not required.
Training with regularization¶
However, what happens to the performance of the model if we penalize, out of temptation, the weight values even to a smaller degree.
bin_sgd_clf_l2 = SGDClassifier(loss='log',
penalty='l2',
eta0=0.01,
alpha=0.001,
max_iter=1,
warm_start=True,
learning_rate='constant',
random_state=1729
)
loss = []
iter =100
for i in range(iter):
bin_sgd_clf_l2.fit(X_train, y_train_0)
y_pred = bin_sgd_clf_l2.predict_proba(X_train)
loss.append(log_loss(y_train_0,y_pred))
plt.figure()
plt.plot(np.arange(iter), loss)
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()
Let us calculate training and testing accuracy.
print('Training accuracy : {0:.4f}'.format(bin_sgd_clf_l2.score(X_train,y_train_0)))
print('Testing accuracy : {0:.4f}'.format(bin_sgd_clf_l2.score(X_test,y_test_0)))
Training accuracy : 0.9907 Testing accuracy : 0.9905
Let's compute Precision, recall and f1-score for the model.
y_hat_train_0 = bin_sgd_clf_l2.predict(X_train)
cm_display = ConfusionMatrixDisplay.from_predictions(y_train_0,y_hat_train_0,values_format='.5g')
print(classification_report(y_train_0,y_hat_train_0))
precision recall f1-score support
0.0 0.99 1.00 0.99 54077
1.0 0.98 0.93 0.95 5923
accuracy 0.99 60000
macro avg 0.99 0.96 0.97 60000
weighted avg 0.99 0.99 0.99 60000
weights = bin_sgd_clf_l2.coef_
bias = bin_sgd_clf_l2.intercept_
print('Bias :', bias)
print('Shape of weights :', weights.shape)
print('Shape of bias :', bias.shape)
Bias : [-4.43002876] Shape of weights : (1, 784) Shape of bias : (1,)
plt.figure()
plt.plot(np.arange(0, 784), weights[0, :])
plt.ylim(np.min(weights[0]-3), np.max(weights[0])+3)
plt.xlabel('Feature Index')
plt.ylabel('Weight Value')
plt.grid(True)
plt.show()
Zero weights calculation
Note: Zero weights can't contribute to features.
num_zero_w = len(np.where(weights == 0)[0])
print('Number of zero weight count:', num_zero_w)
Number of zero weight count: 67
Displaying input image and its prediction¶
index = 7 # try some other index
plt.imshow(X_test[index, :].reshape(28, 28), cmap='plasma')
plt.colorbar()
pred = bin_sgd_clf_l2.predict(X_test[index].reshape(1, -1))
plt.title(str(pred))
plt.show()
Let's plot a few images and their respective predictions with SGDClassifier without regularization.
y_hat_test_0 = bin_sgd_clf.predict(X_test)
num_images = 9
factor = np.int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols = factor, figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(X_test[index+j].reshape(28,28),cmap='plasma')
ax[i,j].set_title('Prediction : {0}'.format(str(y_hat_test_0[index+j])))
ax[i,j].set_axis_off()
indx_0 = np.where(y_test_0 == 1)
zeroImgs= X_test[indx_0[0]]
zeroLabls = y_hat_test_0[indx_0[0]]
num_images = 9
factor = np.int(np.sqrt(num_images))
fig,ax = plt.subplots(nrows=factor, ncols = factor, figsize=(8,6))
idx_offset = 0
for i in range(factor):
index = idx_offset + i*(factor)
for j in range(factor):
ax[i,j].imshow(zeroImgs[index+j].reshape(28,28),cmap='plasma')
ax[i,j].set_title('Prediction : {0}'.format(str(zeroLabls[index+j])))
ax[i,j].set_axis_off()
Hyperparameter Tuning¶
We have to use
cross-validatefolds and mesure the same metrics across these folds for different values of hyper-parameters.Logistic regression uses SGD solver and hence the two important hyperparameters include :
learning rate
regularization rate
For the moment, we skip penalizing the parameters of the model and just search for a better learning rate using
RandomizedSearchCV()and draw the value from the uniform distribution.
lr_grid = loguniform(1e-2,1e-1)
Note:
lr_gridis an object that contains a method calledrvs(), which can be used to get the samples of given size.Therefore, we pass this
lr_gridobject toRandomizedSearchCV(). Internally, it makes use of thisrvs()method for sampling.
print(lr_grid.rvs(3,random_state=42))
[0.02368864 0.0892718 0.05395031]
estimator = SGDClassifier(loss='log',
penalty='l2',
max_iter=1,
warm_start=True,
eta0=0.01,
alpha=0,
learning_rate='constant',
random_state=1729)
scores = RandomizedSearchCV(estimator,
param_distributions={'eta0': lr_grid},
cv=5,
scoring=['precision', 'recall', 'f1'],
n_iter=5,
refit='f1')
scores.fit(X_train,y_train_0)
RandomizedSearchCV(cv=5,
estimator=SGDClassifier(alpha=0, eta0=0.01,
learning_rate='constant', loss='log',
max_iter=1, random_state=1729,
warm_start=True),
n_iter=5,
param_distributions={'eta0': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002A3C80566D0>},
refit='f1', scoring=['precision', 'recall', 'f1'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5,
estimator=SGDClassifier(alpha=0, eta0=0.01,
learning_rate='constant', loss='log',
max_iter=1, random_state=1729,
warm_start=True),
n_iter=5,
param_distributions={'eta0': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002A3C80566D0>},
refit='f1', scoring=['precision', 'recall', 'f1'])SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=1, random_state=1729, warm_start=True)SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=1, random_state=1729, warm_start=True)scores.cv_results_
{'mean_fit_time': array([0.29237275, 0.29635863, 0.30164819, 0.29441676, 0.30241408]),
'std_fit_time': array([0.00423318, 0.0040076 , 0.00889626, 0.00300337, 0.0108834 ]),
'mean_score_time': array([0.02680187, 0.02681022, 0.027 , 0.02519341, 0.02759819]),
'std_score_time': array([0.00040159, 0.00073386, 0.00109748, 0.00074543, 0.00135997]),
'param_eta0': masked_array(data=[0.02368863950364078, 0.08927180304353625,
0.05395030966670228, 0.039687933304443715,
0.01432249371823025],
mask=[False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'eta0': 0.02368863950364078},
{'eta0': 0.08927180304353625},
{'eta0': 0.05395030966670228},
{'eta0': 0.039687933304443715},
{'eta0': 0.01432249371823025}],
'split0_test_precision': array([0.93327909, 0.86936937, 0.90101325, 0.91304348, 0.94356846]),
'split1_test_precision': array([0.95633562, 0.94230769, 0.94839255, 0.95068027, 0.9594478 ]),
'split2_test_precision': array([0.97063903, 0.94216262, 0.95826235, 0.96397942, 0.97731239]),
'split3_test_precision': array([0.97604259, 0.96910856, 0.97508897, 0.97513321, 0.97781721]),
'split4_test_precision': array([0.97053726, 0.95693368, 0.96715644, 0.9672696 , 0.9738676 ]),
'mean_test_precision': array([0.96136672, 0.93597638, 0.94998271, 0.9540212 , 0.96640269]),
'std_test_precision': array([0.01548989, 0.03479022, 0.02605513, 0.02195786, 0.01322732]),
'rank_test_precision': array([2, 5, 4, 3, 1]),
'split0_test_recall': array([0.96875 , 0.97804054, 0.97635135, 0.97550676, 0.96030405]),
'split1_test_recall': array([0.94341216, 0.95185811, 0.94679054, 0.94425676, 0.93918919]),
'split2_test_recall': array([0.94852321, 0.94852321, 0.94936709, 0.94852321, 0.94514768]),
'split3_test_recall': array([0.92827004, 0.92658228, 0.92489451, 0.92658228, 0.92995781]),
'split4_test_recall': array([0.94514768, 0.93755274, 0.9443038 , 0.94767932, 0.94345992]),
'mean_test_recall': array([0.94682062, 0.94851138, 0.94834146, 0.94850966, 0.94361173]),
'std_test_recall': array([0.0129795 , 0.01722875, 0.01645645, 0.01567437, 0.00986919]),
'rank_test_recall': array([4, 1, 3, 2, 5]),
'split0_test_f1': array([0.9506838 , 0.92050874, 0.93717065, 0.94324214, 0.9518627 ]),
'split1_test_f1': array([0.94982993, 0.94705882, 0.94759087, 0.94745763, 0.94921041]),
'split2_test_f1': array([0.95945369, 0.94533221, 0.95379398, 0.95618886, 0.96096096]),
'split3_test_f1': array([0.95155709, 0.94736842, 0.94932871, 0.95023799, 0.9532872 ]),
'split4_test_f1': array([0.95767422, 0.94714408, 0.95559351, 0.95737425, 0.95842263]),
'mean_test_f1': array([0.95383975, 0.94148246, 0.94869555, 0.95090017, 0.95474878]),
'std_test_f1': array([0.00393621, 0.01051201, 0.00645012, 0.00530696, 0.00431804]),
'rank_test_f1': array([2, 5, 4, 3, 1])}
Let us pick the best estimator from the results
best_bin_clf = scores.best_estimator_
best_bin_clf
SGDClassifier(alpha=0, eta0=0.01432249371823025, learning_rate='constant',
loss='log', max_iter=1, random_state=1729, warm_start=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier(alpha=0, eta0=0.01432249371823025, learning_rate='constant',
loss='log', max_iter=1, random_state=1729, warm_start=True)y_hat_train_best_0 = best_bin_clf.predict(X_train)
print(classification_report(y_train_0, y_hat_train_best_0))
precision recall f1-score support
0.0 0.99 1.00 0.99 54077
1.0 0.98 0.92 0.95 5923
accuracy 0.99 60000
macro avg 0.99 0.96 0.97 60000
weighted avg 0.99 0.99 0.99 60000
Other Evaluation metrics¶
1. Precision / Recall Tradeoff¶
y_scores = bin_sgd_clf.decision_function(X_train)
precisions, recalls, thresholds = precision_recall_curve(y_train_0,y_scores)
plt.figure(figsize=(10,4))
plt.plot(thresholds,precisions[:-1],'r--',label='precisions')
plt.plot(thresholds,recalls[:-1],'b-',label='recalls')
plt.title('Precision / Recall Tradeoff' ,fontsize=16)
plt.legend(loc='best')
plt.grid(True)
plt.xlabel('thresholds')
plt.show()
2. Precision Recall Curve¶
plt.figure(figsize=(10, 4))
plt.plot(recalls[:-1], precisions[:-1], 'b-')
plt.title('Precision Recall Curve', fontsize=16)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(True)
plt.show()
3. ROC curve¶
fpr, tpr, thresholds = roc_curve(y_train_0, y_scores)
plt.figure(figsize=(10, 4))
plt.plot(fpr, tpr, linewidth=2, label='Perceptron')
plt.plot([0, 1], [0, 1], 'k--', label='Best_estimator')
plt.title('ROC Curve', fontsize=16)
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
plt.grid(True)
plt.legend()
plt.show()
4. ROC-AUC score¶
auc = roc_auc_score(y_train_0, y_scores)
print('AUC : {0:.6f}'.format(auc))
AUC : 0.998627
Classsification using Ridge Classifier¶
Ridge Classifier casts the problem as the least-squares classification & finds the optimal weight using some matrix decompostion technique such as Singular-Value Decompostion (SVD).
To train the ridge classifier, the labels should be $y ∈ {+1 ,-1}$.
The classifer also by default implements L2 regularization. However, we first implement it without regularization by setting
alpha = 0
Importing new libraries¶
# Common imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.stats import loguniform
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from pprint import pprint
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
sns.set()
# global settings
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
mpl.rc('figure', figsize=(8, 6))
import warnings
warnings.filterwarnings('ignore')
Getting Data¶
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
Data Preprocessing and Splitting¶
X = X.to_numpy()
y = y.to_numpy()
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# initialize new variables names with all -1.
y_train_0 = np.ones((len(y_train)))
y_test_0 = np.ones((len(y_test)))
# find indices of digit 0 image
# remember original labels are of type str not int
indx_0 = np.where(y_train == '0')
# use those indices to modify y_train_0 & y_test_0
y_train_0[indx_0] = 1
indx_0 = np.where(y_test == '0')
y_test_0[indx_0] = 1
Model Building¶
First taking a look into the parameters of the class :
RidgeClassifier (
alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver='auto', positive=False, random_state=None,
)
Note : The parameter normalize is depreciated.
estimator = RidgeClassifier(normalize=False ,alpha=0)
pipe_ridge = make_pipeline(MinMaxScaler() ,estimator)
pipe_ridge.fit(X_train ,y_train_0)
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))])MinMaxScaler()
RidgeClassifier(alpha=0, normalize=False)
Checking on performance of model
y_hat_test_0 = pipe_ridge.predict(X_test)
print(classification_report(y_test_0, y_hat_test_0))
precision recall f1-score support
1.0 1.00 1.00 1.00 10000
accuracy 1.00 10000
macro avg 1.00 1.00 1.00 10000
weighted avg 1.00 1.00 1.00 10000
Cross Validation¶
cv_ridge_clf = cross_validate(
pipe_ridge,
X_train ,y_train_0 ,cv=5,
scoring=['precision' ,'recall', 'f1'],
return_train_score=True ,
return_estimator=True)
pprint(cv_ridge_clf)
{'estimator': [Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))]),
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))]),
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))]),
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))]),
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))])],
'fit_time': array([4.25958037, 4.51285076, 4.62808609, 4.67159963, 4.39858842]),
'score_time': array([0.09599876, 0.10518098, 0.1021142 , 0.10399866, 0.09025025]),
'test_f1': array([1., 1., 1., 1., 1.]),
'test_precision': array([1., 1., 1., 1., 1.]),
'test_recall': array([1., 1., 1., 1., 1.]),
'train_f1': array([1., 1., 1., 1., 1.]),
'train_precision': array([1., 1., 1., 1., 1.]),
'train_recall': array([1., 1., 1., 1., 1.])}
Best estimator ID
best_estimator_id = np.argmax(cv_ridge_clf['train_f1'])
best_estimator_id
0
Best Estimator
best_estimator = cv_ridge_clf['estimator'][best_estimator_id]
best_estimator
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('ridgeclassifier', RidgeClassifier(alpha=0, normalize=False))])MinMaxScaler()
RidgeClassifier(alpha=0, normalize=False)
Lets evaluate the performance of the best classsifier on the test set.
y_hat_test_0 = best_estimator.predict(X_test)
print(classification_report(y_test_0 ,y_hat_test_0))
precision recall f1-score support
1.0 1.00 1.00 1.00 10000
accuracy 1.00 10000
macro avg 1.00 1.00 1.00 10000
weighted avg 1.00 1.00 1.00 10000
Further Exploration¶
Let's see what these classifiers learnt about the digit 0.
# models = (pipe_sgd ,pipe_sgd_l2 ,pipe_logit ,pipe_ridge)
# titles = ('SGD' ,'Regularized SGD', 'Logit' ,'Ridge')
# plt.figure(figsize=(5,5))
# for i in range(0,4):
# w = models[i][1].coef_
# w_matrix = w.reshape(28,28)
# plt.subplot(2,2,i+1)
# plt.imshow(w_matrix ,cmap='gray')
# plt.title(titles[i])
# plt.axis('off')
# plt.grid(False)
# plt.show()
Multiclass Classifier (OneVsAll)¶
In this notenook, we will implement multiclass classification using LogisticRegression with both :
SGD i.e. SGDClassifier(loss='log')
solvers i.e. LogisticRegression(solver='lbfgs')
Imports¶
# Common imports
import numpy as np
from pprint import pprint
# to make this notebook's output stable across runs
np.random.seed(42)
# sklearn specific imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import log_loss
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score
#scipy
from scipy.stats import loguniform
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# global settings
mpl.rc('axes',labelsize=14)
mpl.rc('xtick',labelsize=12)
mpl.rc('ytick',labelsize=12)
mpl.rc('figure',figsize=(8,6))
import warnings
warnings.filterwarnings('ignore')
Getting Data¶
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
Data Preprocessing and Splitting¶
X = X.to_numpy()
y = y.to_numpy()
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Multiclass LogisticRegression using SGDClassifier¶
Model Building¶
estimator = SGDClassifier(loss='log', penalty="l2", alpha=0, max_iter=1, random_state=1729, learning_rate="constant", eta0=0.01 ,warm_start=True)
pipe_sgd_ova = make_pipeline(MinMaxScaler() ,estimator)
loss = []
iter = 100
for i in range(iter):
pipe_sgd_ova.fit(X_train ,y_train)
y_pred = pipe_sgd_ova.predict_proba(X_train)
loss.append(log_loss(y_train ,y_pred))
Visualization of Loss VS iterations
plt.figure()
plt.plot(np.arange(iter), loss)
plt.grid(True)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()
What happened behind the screen is that the library automatically created 10 binary classifiers and trained them.
During the inference time, the input will be passed through all the 10 classifiers and the highest score among the outputs will be considered as the predicted class.
To see it in action, let's execute the following lines of code :
pipe_sgd_ova[1]
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=1, random_state=1729, warm_start=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier(alpha=0, eta0=0.01, learning_rate='constant', loss='log',
max_iter=1, random_state=1729, warm_start=True)pipe_sgd_ova[1].coef_
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
pipe_sgd_ova[1].coef_.shape
(10, 784)
So it is a matrix of size $10$ X $784$ . A row represents the weights of a single binary classifier.
y_hat = pipe_sgd_ova.predict(X_test)
y_hat[:5]
array(['7', '2', '1', '0', '4'], dtype='<U1')
Evaluating Metrics¶
cm_display = ConfusionMatrixDisplay.from_predictions(y_test ,y_hat, values_format='.5g')
plt.show()
print(classification_report(y_test ,y_hat))
precision recall f1-score support
0 0.93 0.98 0.96 980
1 0.96 0.98 0.97 1135
2 0.92 0.89 0.91 1032
3 0.88 0.91 0.90 1010
4 0.91 0.94 0.92 982
5 0.90 0.84 0.87 892
6 0.95 0.94 0.95 958
7 0.93 0.91 0.92 1028
8 0.88 0.85 0.87 974
9 0.89 0.89 0.89 1009
accuracy 0.92 10000
macro avg 0.91 0.91 0.91 10000
weighted avg 0.91 0.92 0.91 10000
Multiclass LogisticRegression using solvers¶
Model Building¶
pipe_logreg_ova = make_pipeline(MinMaxScaler() ,LogisticRegression(solver='lbfgs' ,C=np.infty ,random_state=1729))
pipe_logreg_ova.fit(X_train ,y_train)
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('logisticregression',
LogisticRegression(C=inf, random_state=1729))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('logisticregression',
LogisticRegression(C=inf, random_state=1729))])MinMaxScaler()
LogisticRegression(C=inf, random_state=1729)
Making predictions¶
y_hat = pipe_logreg_ova.predict(X_test)
Evaluating Metrics¶
cm_display = ConfusionMatrixDisplay.from_predictions(y_test ,y_hat ,values_format='.5g')
plt.show()
print(classification_report(y_test ,y_hat))
precision recall f1-score support
0 0.95 0.98 0.97 980
1 0.96 0.98 0.97 1135
2 0.93 0.90 0.91 1032
3 0.90 0.91 0.91 1010
4 0.94 0.93 0.93 982
5 0.91 0.88 0.89 892
6 0.93 0.95 0.94 958
7 0.94 0.93 0.93 1028
8 0.88 0.88 0.88 974
9 0.91 0.92 0.92 1009
accuracy 0.93 10000
macro avg 0.92 0.92 0.92 10000
weighted avg 0.93 0.93 0.93 10000
Visualize weight values¶
w = pipe_logreg_ova[1].coef_
# normalization
w = MinMaxScaler().fit_transform(w)
fig, ax = plt.subplots(3,3)
index = 1
for i in range(3):
for j in range(3):
ax[i][j].imshow(w[index, :].reshape(28,28) ,cmap='gray')
ax[i][j].set_title('w{0}'.format(index))
ax[i][j].set_axis_off()
index += 1
Text Classification using Naive Bayes classifier¶
In this notebook, we will use Niave Bayes classifier for classifying text.
Naive bayes is used for text classification & spam detection tasks.
Here is an example as to how to perform the text classification with Naive Bayes Classifier.
import numpy as np
# data loading
from sklearn.datasets import fetch_20newsgroups
# preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
# model / estimator
from sklearn.naive_bayes import MultinomialNB
# pipeline utilty
from sklearn.pipeline import Pipeline
# model evaluation
from sklearn.metrics import ConfusionMatrixDisplay
# plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
Getting dataset¶
We will be using 20 newsgroup dataset for classification.
As a first step, let's download 20 newsgroup dataset with fetch_20newsgroup API.
data = fetch_20newsgroups()
Lets look at the name of the classes.
data.target_names
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
There are 20 categories in the dataset. For simplicity, we will select 4 of these categories and download their training and test set.
categories = ['talk.religion.misc',
'soc.religion.christian', 'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train' ,categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
Lets look at a sample training document :
print(train.data[1])
From: MANDTBACKA@finabo.abo.fi (Mats Andtbacka)
Subject: Re: If There Were No Hell
Organization: Unorganized Usenet Postings UnInc.
Lines: 26
In <May.5.02.51.25.1993.28737@athos.rutgers.edu> shellgate!llo@uu4.psi.com writes:
> Here's a question that some friends and I were debating last night.
> Q: If you knew beyond all doubt that hell did not exist and that
> unbelievers simply remained dead, would you remain a Christian?
(Reasoning pertinent to believing Xians deleted for space)
It strikes me, for no apparent reason, that this is reversible.
I.e., if I had proof that there existed a hell, in which I would be
eternally punished for not believing in life, would that make me a Xian?
(pardon my language) _Bloody_hell_no_!
...Of course, being merely a reversal of your thinking, this
doesn't add anything _new_ to the debate, but...
> Several friends disagreed, arguing the fear of hell was necessary
> to motivate people to Christianity. To me that fatally undercuts the
> message that God is love.
A point very well taken, IMNSHO.
--
"Successful terrorism is called revolution, and is admired by history.
Unsuccessful terrorism is just lowly, cowardly terrorism."
- Phil Trodwell on alt.atheism
This data is different than what we have seen so far. Here the training data contains document in text form.
Data Preprocessing and Modeling¶
As we have mentioned in the first week of MLT, we need to convert th text data to numeric form.
TfidfVectorizeris one such API that converts text input into a vector of numerical values.We will use
TfidfVectorizeras as preprocessing step to obtain feature vector corresponding to the text document.We will be using
MultinomialNBclassifier for categorizing documents from 20 newsgroup corpus.
from sklearn.pipeline import make_pipeline
model = make_pipeline(TfidfVectorizer() ,MultinomialNB())
Lets train the model.
model.fit(train.data ,train.target)
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
('multinomialnb', MultinomialNB())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
('multinomialnb', MultinomialNB())])TfidfVectorizer()
MultinomialNB()
Model Evaluation¶
Lets first predict the labels for the test set and then calculate the confusion matrix for th test set.
ConfusionMatrixDisplay.from_estimator(model, test.data ,test.target ,display_labels=test.target_names ,xticks_rotation='vertical')
plt.show()
Obsereve that :
There is a confusion between the documents of class
soc.religion.christianandtalk.religion.misc,which is along the expected lines.The classes
comp.graphicsandsci.spaceare well separated by such a simple classifier.
Now we have the tool to classify statements into one of these four classes.
- Make use of
predictfunction on pipeline for predicting category of a test string.
def predict_category(s, train=train ,model=model):
pred = model.predict([s])
return train.target_names[pred[0]]
Using the function of prediction :
predict_category('sending a payload to the ISS')
'sci.space'
predict_category('what is your screen resolution')
'comp.graphics'
predict_category('the Seven Sacraments are')
'soc.religion.christian'
predict_category('discussing islam')
'soc.religion.christian'
Here we can observe the confusion between the classes of soc.religion.christian and talk.religion.misc mentioned previously.
Softmax Regression on MNIST¶
The objective of this notebook is to demonstrate softmax regression in classification task.
We make use of MNIST dataset for multiclass classification of images into digits they represent.
Importing Libraries¶
# Common imports
import numpy as np
from pprint import pprint
# to make this notebook's output stable across runs
np.random.seed(42)
# sklearn specific imports
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
from sklearn.pipeline import make_pipeline ,Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression ,LogisticRegressionCV
from sklearn.model_selection import cross_validate, RandomizedSearchCV, cross_val_predict
# log loss is also known as cross entropy loss
from sklearn.metrics import log_loss
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve ,make_scorer ,f1_score
from sklearn.metrics import roc_curve, roc_auc_score
#scipy
from scipy.stats import loguniform
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# global settings
mpl.rc('axes',labelsize=14)
mpl.rc('xtick',labelsize=12)
mpl.rc('ytick',labelsize=12)
mpl.rc('figure',figsize=(8,6))
import warnings
warnings.filterwarnings('ignore')
Data Loading¶
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
It returns the data and labels as a panda dataframe.
Data Splitting¶
X = X.to_numpy()
y = y.to_numpy()
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Model Building¶
We scale the input features with StandardScaler and use LogisticRegression estimator with multi_class parameter set to multinomial and using sag solver.
pipe = Pipeline([('scaler',StandardScaler()),
('logreg',LogisticRegression(multi_class='multinomial',solver='sag'))])
pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()),
('logreg',
LogisticRegression(multi_class='multinomial', solver='sag'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('logreg',
LogisticRegression(multi_class='multinomial', solver='sag'))])StandardScaler()
LogisticRegression(multi_class='multinomial', solver='sag')
pipe.score(X_train,y_train)
0.9305833333333333
pipe.score(X_test,y_test)
0.9249
image = pipe[-1].coef_[4].reshape(28, 28)
plt.imshow(image)
<matplotlib.image.AxesImage at 0x166cb5a2be0>
After training the model with the training feature matrix and labels, we learn the model parameters.
pipe[-1].coef_.shape
(10, 784)
pipe[-1].intercept_.shape
(10,)
pipe[-1].classes_
array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype=object)
Model Evaluation¶
print(classification_report(y_test ,pipe.predict(X_test)))
precision recall f1-score support
0 0.95 0.98 0.97 980
1 0.96 0.98 0.97 1135
2 0.94 0.90 0.92 1032
3 0.91 0.91 0.91 1010
4 0.92 0.94 0.93 982
5 0.91 0.87 0.89 892
6 0.93 0.95 0.94 958
7 0.92 0.93 0.92 1028
8 0.88 0.88 0.88 974
9 0.91 0.91 0.91 1009
accuracy 0.92 10000
macro avg 0.92 0.92 0.92 10000
weighted avg 0.92 0.92 0.92 10000
Most of the classses have f1_score greater than 90%, which is considered to be a good f1_score.
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)
plt.show()
Using LogisticRegressionCV¶
X_tr, X_te, y_tr, y_te = X[:10000], X[10000:10500], y[:10000], y[10000:10500]
scorer = make_scorer(f1_score, average='micro')
pipe = Pipeline([('scaler', StandardScaler()),
('logreg', LogisticRegressionCV(cv=3,
multi_class='multinomial', solver='sag',
scoring=scorer, max_iter=100, random_state=1729))])
Note : takes quite a while to finish training (almost 10 mins)
pipe.fit(X_tr,y_tr)
Pipeline(steps=[('scaler', StandardScaler()),
('logreg',
LogisticRegressionCV(cv=3, multi_class='multinomial',
random_state=1729,
scoring=make_scorer(f1_score, average=micro),
solver='sag'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('logreg',
LogisticRegressionCV(cv=3, multi_class='multinomial',
random_state=1729,
scoring=make_scorer(f1_score, average=micro),
solver='sag'))])StandardScaler()
LogisticRegressionCV(cv=3, multi_class='multinomial', random_state=1729,
scoring=make_scorer(f1_score, average=micro),
solver='sag')Learning the model parameters.¶
pipe[-1].C_
array([0.04641589, 0.04641589, 0.04641589, 0.04641589, 0.04641589,
0.04641589, 0.04641589, 0.04641589, 0.04641589, 0.04641589])
pipe[-1].l1_ratio_
array([None, None, None, None, None, None, None, None, None, None],
dtype=object)
Model Evaluation¶
print(classification_report(y_te ,pipe.predict(X_te)))
precision recall f1-score support
0 0.98 0.98 0.98 56
1 0.92 0.95 0.93 57
2 0.92 0.94 0.93 51
3 0.94 0.94 0.94 49
4 0.91 0.93 0.92 46
5 0.98 0.91 0.94 46
6 0.90 0.90 0.90 50
7 0.98 0.96 0.97 51
8 0.72 0.78 0.75 40
9 1.00 0.94 0.97 54
accuracy 0.93 500
macro avg 0.93 0.92 0.92 500
weighted avg 0.93 0.93 0.93 500
ConfusionMatrixDisplay.from_estimator(pipe, X_te, y_te)
plt.show()
Importing Libraries¶
# Common imports
import numpy as np
from pprint import pprint
# to make this notebook's output stable across runs
np.random.seed(42)
# sklearn specific imports
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
from sklearn.pipeline import make_pipeline ,Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier, LogisticRegression ,LogisticRegressionCV
from sklearn.model_selection import cross_validate, RandomizedSearchCV,GridSearchCV, cross_val_predict ,learning_curve
from sklearn.neighbors import KNeighborsClassifier
# log loss is also known as cross entropy loss
from sklearn.metrics import log_loss
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve ,make_scorer ,f1_score
from sklearn.metrics import roc_curve, roc_auc_score
#scipy
from scipy.stats import loguniform
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# global settings
mpl.rc('axes',labelsize=14)
mpl.rc('xtick',labelsize=12)
mpl.rc('ytick',labelsize=12)
mpl.rc('figure',figsize=(8,6))
import warnings
warnings.filterwarnings('ignore')
Handwritten Digit Classification¶
Dataset¶
Each datapoint is contained in $x_i$ ∊ $\mathbb{R}^{784}$ and the label $y_i$ ∊ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.to_numpy()
y = y.to_numpy()
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Binary Classification¶
Changing labels to binary¶
Let us do binary classification with KNN classifier and eventually extend it to Multiclass setup.
# initialize new variable names with all -1
y_train_0 = -1*np.ones(len(y_train))
y_test_0 = -1*np.ones(len(y_test))
# find indices of digit 0 image
indx_0 = np.where(y_train == '0')
# remember original labels are of type str not int, so use those indices to modify y_train_0 & y_test_0
y_train_0[indx_0] = 1
indx_0 = np.where(y_test == '0')
y_test_0[indx_0] = 1
Data Visualization in Lower Dimensions¶
Let us apply PCA on the datapoints and reduce the dimensions to 2D and then to 3D.
This will give us some rough idea about the points in $ \mathbb {R}^{784}$
One interesting thing to look at is the change in the magnitude of the data points before and after applying PCA.
from sklearn.decomposition import PCA
pipe_pca_2d = make_pipeline(MinMaxScaler(), PCA(n_components=2))
X_train_pca_2d = pipe_pca_2d.fit_transform(X_train)
Visualization of the 2D data obtained through PCA
plt.figure(figsize=(10, 10))
cmap = matplotlib.colors.ListedColormap(['r', 'b'])
sns.scatterplot(x=X_train_pca_2d[:, 0], y=X_train_pca_2d[:, 1],
data=X_train_pca_2d, hue=y_train_0, palette=cmap)
plt.show()
Projection in 3D using PCA
pipe_pca_3d = make_pipeline(MinMaxScaler() ,PCA(n_components=3))
X_train_pca_3d = pipe_pca_3d.fit_transform(X_train)
import plotly.express as px
cmap = matplotlib.colors.ListedColormap(['r', 'b'])
fig = px.scatter_3d(x=X_train_pca_3d[:,0],
y=X_train_pca_3d[:,1],
z=X_train_pca_3d[:,2],
color=y_train_0,
color_discrete_map=cmap,
opacity=0.5)
fig.show()
KNN classifier¶
Algorithm :¶
Set $k$ to desired value i.e. how many neighbors should be allowed to participate in prediction.
Calculate the distance between the new example and every example from the data. Thus, creating a distance vector.
Get indices of nearest $k$ neighbors.
Get the labels of the selected $k$ entries.
If it is a classification task, return the majority class by computing mode of $k$ labels.
To understand the working of sklearn built-in function, we first create a KNN classifier model with $k$=3 and consider a smaller number of samples of training and test set.
The
KNeighborsClassifercreates a classifier instance.There are many optional arguments such as
n_neighbors, metric, weights,.... that can be set to suitable values while creating an instance.
Creating a new pipeline for classifier :
We use the variables pipe_pca_2d for preprocessing the samples alone and pipe_clf_pca_2d for classification.
pipe_clf_pca_2d = make_pipeline(pipe_pca_2d, KNeighborsClassifier(n_neighbors=3))
Let us train the model with 10 samples from training set (i.e. we are just putting 10 datapoints in the metric space, not building any parameterized model)
Then test the model with 10 datapoints from test set.
index_neg = np.argsort(y_train_0)[:5]
index_pos = np.argsort(y_train_0)[-1:-6:-1]
# create a small dataset
x = np.vstack((X_train[index_pos, :], X_train[index_neg, :]))
y = np.hstack((y_train_0[index_pos], y_train_0[index_neg]))
y
array([ 1., 1., 1., 1., 1., -1., -1., -1., -1., -1.])
pipe_clf_pca_2d.fit(x,y)
Pipeline(steps=[('pipeline',
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('pca', PCA(n_components=2))])),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=3))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pipeline',
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('pca', PCA(n_components=2))])),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=3))])Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('pca', PCA(n_components=2))])MinMaxScaler()
PCA(n_components=2)
KNeighborsClassifier(n_neighbors=3)
# for visulization
x_reduced = pipe_clf_pca_2d[0].transform(x)
plt.figure(figsize=(6, 4))
sns.scatterplot(x=x_reduced[:, 0], y=x_reduced[:, 1],
hue=y, marker='o', palette=['r', 'b'])
plt.grid(True)
plt.show()
y_hat_0 = pipe_clf_pca_2d.predict(X_test[:10,:])
print('Test label : ',y_test_0[:10])
print('Predicted Label : ',y_hat_0[:10])
Test label : [-1. -1. -1. 1. -1. -1. -1. -1. -1. -1.] Predicted Label : [ 1. -1. 1. 1. -1. 1. -1. -1. 1. 1.]
ConfusionMatrixDisplay.from_predictions(y_test_0[:10],y_hat_0)
plt.show()
Observe that :
We can see that there are more FP's (as 9 out 10 actual labels are negative)
Let us display both the training points and testing points with their predictions.
We can visually validate the reason behind the performance.
cmap = matplotlib.colors.ListedColormap(['r', 'b'])
plt.figure(figsize=(8, 6))
sns.scatterplot(x=x_reduced[:, 0], y=x_reduced[:, 1],
marker='o', hue=y, palette=cmap)
x_test_reduced = pipe_clf_pca_2d[0].transform(X_test[:10, :])
sns.scatterplot(x=x_test_reduced[:, 0], y=x_test_reduced[:, 1],
s=100, marker='*', hue=y_test_0[:10], palette=cmap, legend=None)
dx, dy = -0.2, 0.2
for i in range(10):
plt.annotate(str(y_hat_0[i]), xy=(x_test_reduced[i, 0]+dx, x_test_reduced[i, 1]+dy))
plt.grid(True)
plt.show()
It would be much better if we know the distance of 3 neighbors for each testing points.
Let us display the distance and connnectivity of neighbors to the test datapoints using th class
NearestNeighbors.In fact,
KNeighborsClassifiercallsNearestNeighborsclass internally to compute all these distances.
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=3)
neighbors.fit(pipe_pca_2d.transform(x))
NearestNeighbors(n_neighbors=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
NearestNeighbors(n_neighbors=3)
which are the first three closest neighbors to the first three in the test set. And how close are they?
dist_neighbors, idx_neighbors = neighbors.kneighbors(pipe_pca_2d.transform(X_test[:10]), 3, return_distance=True)
import statistics
for i in range(3):
print('Distance : {0} \nIndex : {1} \nLabels : {2} \nPrediction : {3}'.format(dist_neighbors[i],idx_neighbors[i],y[idx_neighbors[i].flatten()],
statistics.mode(y[idx_neighbors[i].flatten()])))
print('-'*20)
Distance : [1.54510433 1.56004731 1.61914472] Index : [1 4 6] Labels : [ 1. 1. -1.] Prediction : 1.0 -------------------- Distance : [0.95703587 3.33077652 3.39001596] Index : [7 1 6] Labels : [-1. 1. -1.] Prediction : -1.0 -------------------- Distance : [1.14297879 2.0379748 2.22709669] Index : [6 4 1] Labels : [-1. 1. 1.] Prediction : 1.0 --------------------
Let us train the model with 10000 samples from training set (i.e. we are just putting 10000 datapoints in the metric space, not building any parameterized model).
pipe_clf_pca_2d.fit(X_train[:10000],y_train_0[:10000])
Pipeline(steps=[('pipeline',
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('pca', PCA(n_components=2))])),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=3))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pipeline',
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('pca', PCA(n_components=2))])),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=3))])Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('pca', PCA(n_components=2))])MinMaxScaler()
PCA(n_components=2)
KNeighborsClassifier(n_neighbors=3)
y_hat_0 = pipe_clf_pca_2d.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test_0, y_hat_0)
plt.show()
print(classification_report(y_test_0,y_hat_0))
precision recall f1-score support
-1.0 0.96 0.96 0.96 9020
1.0 0.65 0.63 0.64 980
accuracy 0.93 10000
macro avg 0.81 0.80 0.80 10000
weighted avg 0.93 0.93 0.93 10000
Let's vary the n_neighbours from k=1 to 19 and study the performance of the model.
We use the first 10000 samples from training set.
precision=[]
for k in range(1,20,2):
pipe_clf_pca_2d.__n_neighbors=k
pipe_clf_pca_2d.fit(X_train[:10000],y_train_0[:10000])
y_hat_0 = pipe_clf_pca_2d.predict(X_test)
precision.append(precision_score(y_test_0,y_hat_0))
plt.figure(figsize=(10, 8))
plt.plot(np.arange(1, 20, 2), precision)
plt.xlim((0, 20))
plt.ylim((0.64, 0.66))
plt.xlabel('k(odd values)')
plt.ylabel('Precision')
plt.xticks(ticks=np.arange(1, 20, 2), labels=np.arange(1, 20, 2))
plt.grid(True)
plt.show()
Going without PCA¶
Let us use KNN classifier with all the features in the training samples with the hope that it increases the performance of the model (of course at the cost of computation)
Let's search for $k$ by using cross validation.
NOTE : It takes about 4 minutes for entire computation.
pipe_knn = make_pipeline(MinMaxScaler(),
KNeighborsClassifier(n_neighbors=1))
grid_k = {'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9, 11]}
cv = GridSearchCV(pipe_knn, param_grid=grid_k, scoring='precision', cv=5,n_jobs=1)
cv.fit(X_train, y_train_0)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier',
KNeighborsClassifier(n_neighbors=1))]),
n_jobs=1,
param_grid={'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9,
11]},
scoring='precision')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier',
KNeighborsClassifier(n_neighbors=1))]),
n_jobs=1,
param_grid={'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9,
11]},
scoring='precision')Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=1))])MinMaxScaler()
KNeighborsClassifier(n_neighbors=1)
pprint(cv)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier',
KNeighborsClassifier(n_neighbors=1))]),
n_jobs=1,
param_grid={'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9,
11]},
scoring='precision')
pprint(cv.cv_results_)
{'mean_fit_time': array([0.47387152, 0.47961373, 0.48359861, 0.47079964, 0.48920078,
0.50660219]),
'mean_score_time': array([6.87688494, 6.80422387, 6.82959933, 6.78840113, 6.78140354,
6.97860017]),
'mean_test_score': array([0.97805545, 0.98229094, 0.98212333, 0.98245498, 0.98111657,
0.98078408]),
'param_kneighborsclassifier__n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'kneighborsclassifier__n_neighbors': 1},
{'kneighborsclassifier__n_neighbors': 3},
{'kneighborsclassifier__n_neighbors': 5},
{'kneighborsclassifier__n_neighbors': 7},
{'kneighborsclassifier__n_neighbors': 9},
{'kneighborsclassifier__n_neighbors': 11}],
'rank_test_score': array([6, 2, 3, 1, 4, 5]),
'split0_test_score': array([0.97920133, 0.98493724, 0.98166667, 0.98086522, 0.97921862,
0.97921862]),
'split1_test_score': array([0.96954733, 0.97761194, 0.97676349, 0.97838736, 0.97918401,
0.97831526]),
'split2_test_score': array([0.9775 , 0.98076923, 0.98238255, 0.98484848, 0.98313659,
0.98316498]),
'split3_test_score': array([0.98073702, 0.98155909, 0.98569024, 0.98403361, 0.98154362,
0.98402019]),
'split4_test_score': array([0.98329156, 0.98657718, 0.98411371, 0.98414023, 0.9825 ,
0.97920133]),
'std_fit_time': array([0.01160939, 0.02245838, 0.04283088, 0.01034048, 0.02839201,
0.02809244]),
'std_score_time': array([0.13517499, 0.12510482, 0.04775808, 0.07257631, 0.04769177,
0.15493347]),
'std_test_score': array([0.00466085, 0.00316604, 0.00302285, 0.00245495, 0.001644 ,
0.00233203])}
The best value obtained for k is 7. (check in test_rank_score)
pipe_knn = make_pipeline(MinMaxScaler(),KNeighborsClassifier(n_neighbors=7))
pipe_knn.fit(X_train,y_train)
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=7))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=7))])MinMaxScaler()
KNeighborsClassifier(n_neighbors=7)
Checking performance on test set¶
y_hat_0 = pipe_knn.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_hat_0)
plt.show()
Multiclass Classification¶
Extending KNN classifier to mulitclass classification is pretty simple straightforward.
pprint(pipe_knn)
Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=7))])
pipe_knn.fit(X_train, y_train)
y_hat = pipe_knn.predict(X_test)
pipe_knn.classes_
array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype=object)
print(classification_report(y_test, y_hat))
precision recall f1-score support
0 0.97 0.99 0.98 980
1 0.95 1.00 0.97 1135
2 0.98 0.96 0.97 1032
3 0.97 0.97 0.97 1010
4 0.98 0.96 0.97 982
5 0.97 0.97 0.97 892
6 0.98 0.99 0.98 958
7 0.96 0.96 0.96 1028
8 0.99 0.94 0.96 974
9 0.96 0.95 0.96 1009
accuracy 0.97 10000
macro avg 0.97 0.97 0.97 10000
weighted avg 0.97 0.97 0.97 10000
Overview¶
We know that k-NN can be used in addressing regressing problems.
In this notebook, we will demonstrate the use of k-NN in regression setup with California housing dataset, where we try to predict price of a house based on its features.
Imports¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler ,PolynomialFeatures
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split ,GridSearchCV ,RandomizedSearchCV
STEP 1 : Training Data¶
Loading the dataset¶
This dataset can be fetched from sklearn with fetch_california_housing API.
from sklearn.datasets import fetch_california_housing
X,y = fetch_california_housing(return_X_y=True)
Lets check the shape of feature matrix and label vector.
print('Shape of feature matrix : ' ,X.shape)
print('Shape of label vector : ',y.shape)
Shape of feature matrix : (20640, 8) Shape of label vector : (20640,)
Perform quick sanity check to make sure we have same number of rows in the feature matrix as well as the label vector.
assert(X.shape[0] == y.shape[0])
Split data into train & test sets¶
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=8)
print('Shape of training feature matrix : ' ,X_train.shape)
print('Shape of training label vector : ',y_train.shape)
print()
print('Shape of test feature matrix : ' ,X_test.shape)
print('Shape of test label vector : ',y_test.shape)
Shape of training feature matrix : (14448, 8) Shape of training label vector : (14448,) Shape of test feature matrix : (6192, 8) Shape of test label vector : (6192,)
assert(X_train.shape[0] == y_train.shape[0])
assert(X_test.shape[0] == y_test.shape[0])
Preprocessing the dataset¶
We have explored California housing set in detail earlier in the course
In order to refresh your memory, we have bargraphs corresponding to all the features and the output label plotted here.
california_housing = fetch_california_housing(as_frame=True)
california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
plt.subplots_adjust(hspace=0.5, wspace=0.4)
Observe that :
The features are on a different scale and we need to bring them on the same scale for k-NN.
k-NN uses Euclidean distance computation to identify the nearest neighbors to identify the nearest neighbors and it is crucial to have all the features on the same scale for that.
If all the features are not on the same scale, the feature with wider variance would dominate the distance calculation.
STEP 2 : Model Building¶
We instantiate a pipeline object with two stages;
The first stage performs feature scaling with
MinMaxScaler.And the second stage performs k-NN regressor with
n_neighbors=2. In short, we are using 2-NN that is we use the price of the two nearest houses in feature space to decide the price of the new house.The model is trained with feature matrix and label vector from training set.
After the model is trained, it is evaluated with the test set using the
mean squared errormetric.
pipe = Pipeline([('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=2))])
pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=2))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=2))])MinMaxScaler()
KNeighborsRegressor(n_neighbors=2)
y_pred = pipe.predict(X_test)
error = mean_squared_error(y_test, y_pred, squared=False)
print(error)
0.6767822465759739
STEP 3: Model Selection and Evaluation¶
k-NN classifier has $k$, the number of neighbors, as a hyperparameter.
There are a couple of ways to tune the hyper-parameter
Manual hyper-parameter tuning
Using
GridSearchCVorRandomizedSearchCV.
We will demonstrate both Manual as well as Grid-Search based hyperparameter tuning.
3.A. Manual HPT with cross-validation¶
Here we train and evaluate the model pipeline with different values of k-1 to 31.
rmse_val = []
for K in range(1, 31):
pipe = Pipeline([('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=K))])
# fit the model
pipe.fit(X_train, y_train)
# make prediction on test set
pred = pipe.predict(X_test)
# calculate rmse
error = mean_squared_error(y_test, pred, squared=False)
# store rmse values
rmse_val.append(error)
At the end of this loop, we get a list of RMSEs-one for each value of k.
We plot the learning curve with $k$ on x-axis and RMSE on y-axis.
The value of k that results in the lowest RMSE is the best value of k that we select.
plt.figure(figsize=(10, 10))
plt.plot(range(1, len(rmse_val)+1), rmse_val, color='red')
plt.xlabel('Different values of K', fontsize=12)
plt.ylabel('RMSE', fontsize=12, rotation=0)
plt.grid(True)
plt.title('Validations Loss vs K', fontsize=16)
plt.show()
rmse_val.index((min(rmse_val)))
8
3.B. HPT with GridSearchCV¶
We set up the parameter grid for values of k of our interest.
Here we use the values between 1 and 31.
The object of
GridSearchCVis instantiated with aKNeighborsRegressorestimator along with the parameter grid and number of cross-validation folds equal to 10.The grid search is performed by calling the
fitmethod with training feature matrix and labels as arguments.
param_grid = {'knn__n_neighbors': list(range(1,31))}
pipe = Pipeline([('scaler',MinMaxScaler()),
('knn',KNeighborsRegressor())
])
gs = GridSearchCV(pipe,param_grid = param_grid,
cv =10,
n_jobs=1,
return_train_score=True)
gs.fit(X_train,y_train)
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1,
param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30]},
return_train_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1,
param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30]},
return_train_score=True)Pipeline(steps=[('scaler', MinMaxScaler()), ('knn', KNeighborsRegressor())])MinMaxScaler()
KNeighborsRegressor()
Lets evaluate the best estimator on the test set.
gs.best_estimator_
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=6))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=6))])MinMaxScaler()
KNeighborsRegressor(n_neighbors=6)
gs.best_params_
{'knn__n_neighbors': 6}
Making predictions on the test set
y_pred = gs.best_estimator_.predict(X_test)
mean_squared_error(y_test,y_pred,squared=False)
0.6255268557053962
3.C. HPT with RandmizedSearchCV¶
param_grid = {'knn__n_neighbors': list(range(1,31))}
pipe = Pipeline([('scaler',MinMaxScaler()),
('knn',KNeighborsRegressor())
])
rs = RandomizedSearchCV(pipe, param_distributions=param_grid,
n_jobs=1,refit=True, cv=10,
return_train_score=True)
rs.fit(X_train,y_train)
RandomizedSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1,
param_distributions={'knn__n_neighbors': [1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11,
12, 13, 14, 15, 16,
17, 18, 19, 20, 21,
22, 23, 24, 25, 26,
27, 28, 29, 30]},
return_train_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=10,
estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1,
param_distributions={'knn__n_neighbors': [1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11,
12, 13, 14, 15, 16,
17, 18, 19, 20, 21,
22, 23, 24, 25, 26,
27, 28, 29, 30]},
return_train_score=True)Pipeline(steps=[('scaler', MinMaxScaler()), ('knn', KNeighborsRegressor())])MinMaxScaler()
KNeighborsRegressor()
Lets evaluate the best estimator on the test set.
rs.best_estimator_
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=13))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor(n_neighbors=13))])MinMaxScaler()
KNeighborsRegressor(n_neighbors=13)
rs.best_params_
{'knn__n_neighbors': 13}
Making predictions on the test set
y_pred = rs.best_estimator_.predict(X_test)
mean_squared_error(y_test,y_pred,squared=False)
0.6217404460866567
3.D. GridSearchCV + Polynomial Features¶
In addition, we perform polynomial transformation on the features followed by scaling before using it in the nearest neighbor regressor.
params = {'poly__degree': list(range(1, 4))}
pipe = Pipeline([('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())])
gs_poly = GridSearchCV(estimator=pipe,
param_grid=params,
cv=10,
n_jobs=1)
gs_poly.fit(X_train, y_train)
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1, param_grid={'poly__degree': [1, 2, 3]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())]),
n_jobs=1, param_grid={'poly__degree': [1, 2, 3]})Pipeline(steps=[('poly', PolynomialFeatures()), ('scaler', MinMaxScaler()),
('knn', KNeighborsRegressor())])PolynomialFeatures()
MinMaxScaler()
KNeighborsRegressor()
gs_poly.best_estimator_
Pipeline(steps=[('poly', PolynomialFeatures(degree=1)),
('scaler', MinMaxScaler()), ('knn', KNeighborsRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('poly', PolynomialFeatures(degree=1)),
('scaler', MinMaxScaler()), ('knn', KNeighborsRegressor())])PolynomialFeatures(degree=1)
MinMaxScaler()
KNeighborsRegressor()
We evaluate the model with the test set.
y_pred = gs_poly.best_estimator_.predict(X_test)
error = mean_squared_error(y_test, y_pred, squared=False)
print('RMSE value of k is :', error)
RMSE value of k is : 0.6313551708664618
Outline¶
In this notebook, we study how to handle large-scale datasets in sklearn.
In this course, so far we were able to load entire data in memory and were able to train and make inferences on all the data at once.
The large scale data sets may not fit in memory and we need to devise strategies to handle it in the context of training and prediction use cases.
In this notebook, we will discuss the following topics :
Overview of handling large-scale data.
Incremental preprocessing and learning i.e.
fit()vspartial_fit():partial_fitis our friend in this cases.Combining preprocessing and incremental learning
Large-scale Machine Learning¶
Large-scale Machine Learning differs from traditional machine learning in the sense that it involves processing large amount of data in terms of its size or number of samples, features or classes
There were many exciting developements in efficient large scale learning on many real world use cases in the last decade.
Although scikit-learn is optimized for smaller data, it does offer a decent set of feature preprocessing and learning algorithms for large scale data such as classification, regression and clustering.
Scikit-learn handles large data through
partial_fit()method instead of using the usualfit()method.
The idea is to process data in batches and update the model parameters for each batch. This way of learning is referred to as Incremental (or out-or-core) learning.
Incremental Learning¶
Incremental learning may be required in the following two scenarios :
For out-of-memory (large) datasets ,where it's not possible to load the entire data into the RAM at once, one can load the data in chunks and fit the training model for each chunk of data.
For machine learning tasks where a new batch of data comes with time,re-training the model with the previous and new batch of data is a computationally expensive process.
Instead of re-training the model with the entire set of data, one can employ an incremental learning approach, where the model parameters are updated with the new batch of data.
Incremental Learning in sklearn¶
To perform incremental learning, Scikit-learn implements partial_fit method that helps in training an out-of-memory dataset.
In other words, it has the ability to learn incrementally from a batch of instances.
In this notebook, we will see an example of how to read, process, and train on such a large dataset that can't be loaded in memory entirely.
This method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core (online) learning.
This function has some performance overhead, so it's recommended to call it on a considerable large batch of data(that fits into the memory), to overcome the limitation of overhead.
partial_fit() attributes :¶
partial_fit(X,y,[classes], [sample_weight])
where,
X: array of shape(n_samples, n_features) where n_samples is the number of samples & n_features is the number of features.y: array of shape (n_samples,) of target values.classes: array of shape(n_classes,) containing a list of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequet calls.sample_
weight: (optional) array of shape(n_samples,) containing weights applied to individual samples(1.for unweighted)
Returns: object(self)
For classification tasks, we have to pass the list of possible target class labels in classes parameter to cope-up with the unseen target classes in the 1st batch of the data.
The following estimators implement partial_fit method :
Classification :
MultinomialNB
BernoulliNB
SGDClassifier
Perceptron
Regression :
- SGDRegressor
Clustering :
MiniBatchKmean
SGDRegressor and SGDClassifier are commonly used for handling large data.
The problem with standard regression / classification implementations such as batch gradient descent, support vector machines (SVMs), random forest etc. is that because of the need to load all the data into memory at once, they can not be used in scenarios where we do not have sufficient memory.
SGD, however, can deal with large data sets effectively by breaking up the data into chunks and processing them sequentially.
The fact that we only need to load one chunk into memory at a time makes it useful for large-scale data as well as cases where we get streams of data at intervals.
fit() versus partial_fit()¶
Below, we show the use of partial_fit() along with SGDClassifier.
For the purpose of illustration, we first use traditional fit() and then use partial_fit() on the same data.
Importing Libraries¶
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report ,ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')
Traditional Approach [using fit()]¶
Sample dataset
We will use a synthetic classification dataset for demonstration. Let us have 50000 samples with 10 features matrix.
Further, lets have 3 classes in the target label, each class having a single cluster.
X, y = make_classification(n_samples=50000, n_features=10,
n_classes=3,
n_clusters_per_class=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
We will make use of SGDClassifier to learn the classification model.
clf1 = SGDClassifier(max_iter=1000, tol=0.01)
We will use traditional fit() method to train out model.
clf1.fit(X_train, y_train)
SGDClassifier(tol=0.01)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier(tol=0.01)
Let's obtain the training and test scores on the trained model.
train_score = clf1.score(X_train, y_train)
train_score
0.9223764705882352
test_score = clf1.score(X_test, y_test)
test_score
0.9224
We obtain the confusion matrix and classification report for evaluating the classifier.
from sklearn.metrics import ConfusionMatrixDisplay
y_pred = clf1.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.91 0.97 0.94 2535
1 0.90 0.94 0.92 2511
2 0.97 0.85 0.90 2454
accuracy 0.92 7500
macro avg 0.93 0.92 0.92 7500
weighted avg 0.92 0.92 0.92 7500
Incremental Approach [using partial_fit()]¶
We will now assume that the data can't be kept completely in the main memory and hence, will load chunks of data and fit usng partial_fit().
X_train[:5]
array([[-2.29159844, 0.02091326, 0.83692475, 0.40334635, -0.85712345,
0.10922827, 0.12431572, 2.28815044, 0.08941231, 1.3970629 ],
[ 0.92861135, 0.70557977, 0.04412851, 0.72054533, 0.09660703,
0.30020664, 2.11156696, -1.11936906, -0.44856979, 0.01935755],
[-0.19039778, -0.45235961, -0.64982729, 0.34137055, -0.34691607,
0.21451974, -0.44904781, 0.55076812, 0.78134819, 0.33105366],
[ 1.1583492 , 0.49090667, 1.06375715, -0.51689404, 1.1209298 ,
-0.27734821, 0.64698903, -1.90202329, -1.64475696, -0.78198161],
[ 0.97511431, -0.58098048, 0.21484446, -0.07367407, 1.17026029,
0.09359779, 0.77655646, -1.25990625, -0.65231873, 0.17549224]])
y_train[:5]
array([2, 1, 0, 1, 1])
In order to load data chunk, we will first store the given (training) data in a CSV file.
This is just for demonstration purpose. In a real-case scenario, the large dataset might already be in the form of say, a CSV file which we will be reading in multiple iterations.
train_data = np.concatenate((X_train, y_train[:,np.newaxis]), axis=1)
train_data[:5]
array([[-2.29159844, 0.02091326, 0.83692475, 0.40334635, -0.85712345,
0.10922827, 0.12431572, 2.28815044, 0.08941231, 1.3970629 ,
2. ],
[ 0.92861135, 0.70557977, 0.04412851, 0.72054533, 0.09660703,
0.30020664, 2.11156696, -1.11936906, -0.44856979, 0.01935755,
1. ],
[-0.19039778, -0.45235961, -0.64982729, 0.34137055, -0.34691607,
0.21451974, -0.44904781, 0.55076812, 0.78134819, 0.33105366,
0. ],
[ 1.1583492 , 0.49090667, 1.06375715, -0.51689404, 1.1209298 ,
-0.27734821, 0.64698903, -1.90202329, -1.64475696, -0.78198161,
1. ],
[ 0.97511431, -0.58098048, 0.21484446, -0.07367407, 1.17026029,
0.09359779, 0.77655646, -1.25990625, -0.65231873, 0.17549224,
1. ]])
a = np.asarray(train_data)
np.savetxt('train_data.csv',a, delimiter=',')
Now, our data for demonstration is ready in a csv file.
Let's create SGDClassifier object that we intend to train with partial_fit().
clf2 = SGDClassifier(max_iter=1000, tol=0.01)
Processing data chunk by chunk¶
Pandas' read_csv() function has an attribute
chunksizethat can be used to read data chunk by chunk.The
chunksizeparameter specifies the number of rows per chunk. (The last chunk may contain fewer than chunksize rows, of course.)We can then use this data for
partial_fit().We can then repeat these two steps multiple times. That way entire data may not be required to be kept in memmory.
import pandas as pd
chunksize = 1000
iter = 1
for train_df in pd.read_csv('train_data.csv', chunksize=chunksize, iterator=True):
# print(train_data.shape)
if iter ==1:
# print(train_df)
# In the first iteration, we are specifying all possible class labels.
X_train_partial = train_df.iloc[:,0:10]
y_train_partial = train_df.iloc[:,10]
clf2.partial_fit(X_train_partial,y_train_partial,
classes=np.array([0,1,2]))
else:
X_train_partial = train_df.iloc[:,0:10]
y_train_partial = train_df.iloc[:,10]
clf2.partial_fit(X_train_partial,y_train_partial)
print("After iter # : ", iter)
print(clf2.coef_)
print()
print(clf2.intercept_)
print('-'*30)
iter+=1
After iter # : 1
[[ 19.38830885 -9.8407469 -45.97717728 -5.87280243 0.42674605
6.21765462 3.55850528 0.1417346 41.08967472 -13.94867119]
[ 12.92169609 15.81072392 18.0450216 -9.00425388 12.78754988
18.86647869 9.1221019 -24.31521829 -24.99480569 -14.25634197]
[-29.47922706 8.40986526 24.23256001 13.41447864 1.94339806
-3.96856641 -2.62356224 22.68345616 -13.33739481 14.6376807 ]]
[-59.03986614 -74.98556842 -6.04887573]
------------------------------
After iter # : 2
[[ -0.43318992 -1.0332125 -33.40136516 0.25054696 3.98749852
-1.00136424 2.54789093 17.25781737 36.12161057 2.72497412]
[ 22.4240924 1.12713107 2.65918168 3.41619314 2.26348056
10.95051098 -12.5037477 -27.82945064 -12.54647862 12.96506279]
[-18.262215 -1.03959563 15.93677646 -5.23414935 8.99892245
3.27627136 4.47310495 13.58859847 -9.25742246 0.18387247]]
[-50.34539254 -50.4647003 -4.39114887]
------------------------------
After iter # : 3
[[ 1.59753532 0.34706688 -21.75556115 0.96276257 -0.31501835
1.98008086 1.60594408 9.01962823 22.71546625 1.25182019]
[ 20.63431088 -8.23821557 6.28481279 6.59239713 1.29684542
8.12150785 -0.89386001 -27.53237941 -15.67401586 0.51485952]
[ -7.25862109 -1.04801571 8.00963145 4.54380002 -0.0818056
-7.61300846 -5.35724345 4.56109616 -5.48186436 0.67083948]]
[-23.06547087 -36.41730191 -4.36901345]
------------------------------
After iter # : 4
[[ 4.12979806 -3.25754191 -17.44502475 -0.50180349 4.72890367
0.26726567 -0.8340666 3.86640099 16.98426266 -0.63411815]
[ 6.35059183 -4.84304363 1.29398638 -4.2035575 -4.03819594
-3.40641356 -10.33899167 -8.15259174 -4.13512923 -1.69966893]
[ -7.60896558 -0.62568421 7.60573801 -4.1237475 -3.96908967
-7.54492315 4.45514816 5.17755662 -4.8960158 3.82540408]]
[-27.31709493 -36.20669914 4.08371098]
------------------------------
After iter # : 5
[[ 0.78636273 0.88430578 -13.90820641 2.41506199 2.04141795
-4.00897905 -4.62919637 6.04379268 14.62334255 -2.23188323]
[ 11.64286411 5.07689199 4.43427592 -2.12207189 -1.29634325
0.31799044 -0.6742446 -15.98032997 -9.79946464 4.4346227 ]
[ -8.63695637 2.60007172 7.98740706 -4.27058413 1.92929935
0.43010927 -4.71942397 6.20087622 -4.86261087 -1.06754917]]
[-20.19527231 -21.26607071 2.9874717 ]
------------------------------
After iter # : 6
[[ 5.66763313e+00 2.18980763e+00 -1.40673290e+01 2.14111227e+00
4.74504070e+00 -1.53960803e+00 1.80866526e+00 3.55872027e-01
1.26861684e+01 2.10331381e+00]
[ 2.69700626e+00 -6.98074825e-03 1.74210040e+00 -1.07587636e+00
-3.18587477e+00 3.95901745e+00 7.45153546e+00 -4.06018805e+00
-3.03913665e+00 2.59978972e+00]
[-7.56359601e+00 1.74320666e+00 4.32950198e+00 8.80019215e-01
3.55218798e+00 5.25153111e+00 -6.85714986e-01 6.76650729e+00
-1.39090861e+00 1.14249253e+00]]
[-16.91491035 -19.89733634 -4.87166076]
------------------------------
After iter # : 7
[[ 2.17487785 1.61152085 -12.9117516 1.00578869 -1.12701292
-0.21537709 -0.25258184 3.90354829 12.95157665 -4.31652165]
[ 9.27413961 7.55853696 1.60543694 -4.14975418 0.27643622
0.10506277 -1.49716021 -11.76319642 -5.73296644 2.89124642]
[ -6.50848168 -3.24427134 4.66122605 3.19002412 1.99994282
3.96090081 0.53941647 5.35347496 -2.20352539 4.16345605]]
[-16.77205487 -19.83694959 -3.27313782]
------------------------------
After iter # : 8
[[ 1.14023291 -0.14622242 -9.07827058 -3.47535 0.51534122
0.23476031 -0.3641228 3.20414518 9.27426056 2.60493271]
[ 8.45627242 1.67487973 2.58713351 0.80371066 -2.25326848
-4.6416978 -3.85497924 -11.28898552 -6.43585329 -0.79821464]
[ -6.44012417 -0.54232307 4.41593144 3.43927972 1.44664993
0.81713453 2.65367474 5.39568371 -1.96915341 4.05105577]]
[-10.91632851 -19.98054603 0.175747 ]
------------------------------
After iter # : 9
[[ 2.93286530e+00 -2.07431528e+00 -1.02058901e+01 2.24896060e+00
1.44864272e+00 4.77360353e-01 2.41001411e+00 1.65131331e+00
9.71310726e+00 1.49158994e+00]
[ 5.59776041e+00 2.76891146e+00 3.98802124e+00 -2.44307752e+00
-1.76966142e+00 1.21873812e+00 1.79062306e+00 -8.61371733e+00
-6.70830787e+00 2.02885834e+00]
[-6.15177247e+00 1.28721872e+00 4.80675563e+00 -1.64093148e+00
-4.06449104e+00 -5.45789417e-01 -1.87251820e-03 4.85902612e+00
-2.51416506e+00 -5.10965604e-02]]
[ -9.94175275 -15.9018863 -6.03259008]
------------------------------
After iter # : 10
[[ 0.57372779 0.93741165 -9.16639127 -1.61574541 -0.65766851 -1.37974049
1.43400391 3.9177061 9.61375451 0.05674784]
[ 2.28534626 0.75683994 1.82925278 -0.33898239 -1.63492281 0.67328324
2.65288817 -3.6174659 -2.95509053 0.07243071]
[-4.93984874 -1.39276341 4.93458103 -1.69042452 0.48208759 1.20213882
-1.41807146 3.3629339 -3.17515023 0.92392828]]
[-13.76666229 -14.85896444 -2.88815854]
------------------------------
After iter # : 11
[[ 2.73435194e+00 7.23603594e-01 -7.76006535e+00 -1.74722570e+00
1.12191799e-01 -1.01200484e+00 9.90759858e-01 6.59648304e-01
7.16753651e+00 3.80164889e-01]
[ 4.94221697e+00 2.87566315e+00 -5.24459792e-01 -1.01263509e+00
8.74237195e-01 -2.96585792e+00 -1.95263343e+00 -5.57677114e+00
-1.57045467e+00 1.89240235e+00]
[-4.80436057e+00 3.60709686e-01 1.93591953e+00 1.81524724e-01
7.71107544e-03 2.24745694e-01 1.18819771e+00 4.70623711e+00
-7.59350644e-03 5.07124385e-01]]
[ -9.33088242 -13.01378207 -0.22690318]
------------------------------
After iter # : 12
[[ 3.78681810e-01 5.57617906e-01 -8.70974445e+00 9.03660828e-01
7.51327096e-01 -2.87259473e-01 1.29949349e+00 3.91922954e+00
9.20672200e+00 -1.03173476e+00]
[ 4.81903951e+00 1.60775740e+00 1.11979888e+00 -6.67261996e-03
-2.62019976e-01 -1.07866815e+00 1.51464407e+00 -6.25558322e+00
-3.28620887e+00 -1.25183691e+00]
[-4.03897358e+00 2.46263347e+00 2.44603366e+00 -5.79289132e-01
-2.26814236e+00 9.92631830e-01 -8.97331013e-01 3.54610907e+00
-8.86987559e-01 4.55196466e-01]]
[ -7.77521691 -12.18083387 1.23535232]
------------------------------
After iter # : 13
[[-0.25713377 -0.34849415 -6.50014505 1.12048002 -0.73089061 -0.61008638
0.86796394 3.56271273 7.1041755 2.73770579]
[ 5.25976768 -2.13089839 -1.4790284 0.44365129 -0.90931694 -3.67277652
1.34161944 -5.47341001 -0.68065171 3.06042055]
[-4.37953216 -0.63439005 0.71961305 -2.20809493 -2.07378852 -0.18773528
0.74492294 4.81406291 1.11746096 0.51822926]]
[ -7.76865653 -11.5314022 -2.52483825]
------------------------------
After iter # : 14
[[ 2.35735501 0.93807411 -7.43945079 -0.33451531 0.4441513 0.8141498
-1.74114398 0.9443651 6.98544283 -2.02510503]
[ 6.18867665 1.99135734 -0.64957144 1.13057564 1.40527425 1.45347398
-0.50606953 -6.98685955 -1.9742373 1.09371231]
[-3.60844935 1.37636163 4.48261476 0.92595525 -0.82584851 1.0150513
0.90348273 2.01635132 -3.26397775 3.01084848]]
[ -9.10972811 -10.84377116 -2.48626009]
------------------------------
After iter # : 15
[[ 1.06077704 -0.4824955 -6.20970444 -0.65295966 0.30158007 -1.46517846
-0.87514798 1.85985838 6.22246354 -0.84813864]
[ 3.57101878 -1.62141009 -0.34293639 1.38147117 0.49891704 -1.08968341
1.76769088 -4.04757435 -1.17348378 1.70347244]
[-3.78622137 -0.34127612 3.40356446 -0.9659685 -0.06276722 -0.9929648
-1.66025569 2.76739423 -2.026308 1.74795668]]
[-8.47543764 -9.53346309 -0.49596288]
------------------------------
After iter # : 16
[[ 1.82999217 0.3347636 -5.62263861 1.86627297 0.46268445 0.31758735
1.52459493 0.65662685 5.25862834 -1.04092488]
[ 2.46667622 -0.48919705 1.54090436 -2.84172815 0.12144922 0.40509839
0.7391874 -3.68715986 -2.7231957 -2.07704629]
[-2.54390768 0.8805303 1.24090928 0.98390018 0.10365998 -0.75904713
0.22988936 2.38373855 -0.23623034 -1.10262744]]
[ -6.64884129 -10.0865789 -2.28103332]
------------------------------
After iter # : 17
[[ 1.46705568 -0.43044773 -6.53094655 -1.61429038 -0.46180992 -0.15086572
1.18380237 1.54085701 6.39258473 -0.1236393 ]
[ 4.3649165 0.14131389 0.19569618 2.79227305 -1.07541552 0.97130011
2.12299893 -5.25568849 -2.09587372 -1.80410508]
[-2.90230291 0.16399623 1.52934326 -0.3973443 0.00724875 0.88846686
0.98637696 2.66260952 -0.39173751 0.89463231]]
[ -5.55472948 -10.0477626 -1.10933749]
------------------------------
After iter # : 18
[[ 1.21862863 0.31714157 -6.22946405 -0.52386686 1.25436158 -1.06767475
-0.33907433 1.68324789 6.17554095 1.00159607]
[ 4.77917631 0.31649481 -1.15834116 0.66254816 0.30812263 -0.1647079
0.55064288 -5.06632272 -0.81807808 -1.57668388]
[-4.84840044 -1.24476567 2.94979109 0.83806006 -0.59294345 0.23116222
-0.46847019 4.24996435 -1.07933431 -0.11644074]]
[-6.11977811 -6.83726017 -1.62104969]
------------------------------
After iter # : 19
[[ 2.784369 -1.873803 -3.69274005 0.76415092 0.63458938 0.76506103
1.01698213 -1.43862802 2.77014405 1.26311436]
[ 2.78496594 1.94027272 1.46831492 1.94494511 -0.12680849 -1.45075119
0.71024127 -4.02685713 -2.78257983 -0.72025156]
[-1.99480305 0.93257698 1.87046091 0.03770753 -1.36926451 1.71600148
-0.95204163 1.41928857 -1.15070124 0.64041802]]
[-5.66156085 -7.41107514 -1.0517265 ]
------------------------------
After iter # : 20
[[-0.03059445 0.86407141 -4.85807046 0.35936911 -0.11911422 -0.10142259
-0.25952976 2.47177112 5.23971803 0.55508446]
[ 1.72889379 1.93894267 2.50128294 -0.55339075 0.0977128 1.38042046
-0.45334133 -3.29688904 -3.4377388 0.03354658]
[-1.97924552 -1.00690739 1.68642956 1.17106647 0.99423369 1.54033551
-0.68169762 1.49317098 -0.95943282 0.05786417]]
[-5.17182715 -7.86027965 0.37124357]
------------------------------
After iter # : 21
[[ 0.7728425 -0.43586889 -4.53441023 -0.18361908 -0.61697345 0.58516672
1.26988598 1.36016303 4.54448352 0.91045153]
[ 2.15423834 1.47981503 0.09229772 1.429837 -0.52087136 2.08881383
-0.2361131 -2.59171695 -1.02977654 1.01969971]
[-3.39171174 0.1913217 3.39047495 -2.24374402 -0.50842973 0.26846219
-0.60162806 2.30780507 -2.18262613 -0.21189566]]
[-6.10692559 -8.30144723 -0.54473056]
------------------------------
After iter # : 22
[[ 0.74381059 -0.42628646 -4.20577162 0.63131011 0.40781215 0.74975744
-1.3298004 1.22970227 4.20346091 -0.53435548]
[ 1.8120276 2.07627232 1.77142705 -0.96922296 1.95115375 2.16369166
1.18561041 -3.02920243 -2.68843905 0.37548719]
[-3.24357985 -0.84565716 1.05636135 0.26558079 -0.66520459 0.48455805
-0.88380571 3.30299368 0.2645226 0.36252403]]
[-5.19636759 -8.73209711 -0.54985918]
------------------------------
After iter # : 23
[[ 3.27342103e-01 -3.55222358e-02 -4.24854785e+00 3.44489789e-01
1.54444216e+00 -7.67596825e-01 -3.26671554e-01 1.74324656e+00
4.42936627e+00 -9.88560474e-01]
[ 4.57389040e+00 -4.08345992e-03 9.29902774e-01 -2.64519583e-01
3.06510764e-01 -1.14385674e-01 4.84378381e-02 -5.87071014e+00
-2.97602405e+00 -2.16380277e+00]
[-3.28802719e+00 1.16979694e-01 1.76780372e+00 2.13646001e+00
-2.98809406e-01 8.95312678e-02 1.00472613e+00 2.99882694e+00
-4.81676977e-01 -2.27046727e-01]]
[-6.87079669 -8.72011384 -0.98900169]
------------------------------
After iter # : 24
[[ 0.39806814 0.85392451 -5.55986146 0.05746278 -0.76804008 -0.50510214
-1.57469542 2.31711125 5.8095804 -0.17002208]
[ 3.62964579 -1.89952253 1.09362719 -1.5207439 -0.94971449 -0.0800531
0.36536751 -4.83707706 -2.74431814 1.69570542]
[-1.66684561 -1.79249105 1.28449738 -0.93069556 -0.77657718 -0.69545845
-1.17235882 1.32555093 -0.66195309 0.95791629]]
[-5.65276015 -6.68920397 -1.79135941]
------------------------------
After iter # : 25
[[-0.01898631 -0.72969548 -3.86687668 -1.42871687 -0.67766877 0.9272515
0.63335385 1.96111443 4.16833885 -0.85791643]
[ 3.95061278 -1.1250714 0.39830802 -0.35450217 0.87850044 1.26983184
0.45945413 -4.86772876 -2.13490152 -1.01012395]
[-3.16063309 -0.27779233 2.12719495 0.38578092 0.3217026 1.0817817
0.20523476 2.66811557 -0.92334947 0.01881088]]
[-4.87261211 -6.72687172 -1.75744583]
------------------------------
After iter # : 26
[[-0.49453747 -1.12060109 -3.56430278 -0.16584365 0.79873991 1.85781206
1.50190239 2.37132744 4.0482225 -0.44712405]
[ 1.7827346 -0.22894652 0.61659238 0.0063293 -0.68016705 -0.5803449
-0.60202422 -2.41560706 -1.43336996 -0.24508131]
[-1.46330646 0.5329374 0.74191478 -0.11448193 0.35678164 -0.53156063
-0.85978302 1.35707636 -0.16613594 -1.07063408]]
[-4.10603797 -8.22165655 -1.71113513]
------------------------------
After iter # : 27
[[ 1.68977401 0.0635014 -4.30500649 -0.92223439 1.21465001 0.14966909
-0.48803139 0.16170606 3.9016322 -0.14029386]
[ 2.76113804 0.33993281 0.65040026 0.77709493 -0.54229342 0.5663105
0.56747699 -3.58863604 -1.89234308 1.42098246]
[-2.87272069 0.17333331 2.00819543 -0.69941743 -0.12918147 -0.47239089
-1.00592435 2.38758003 -0.91968295 -0.52783472]]
[-4.85170994 -7.46394203 -1.66979769]
------------------------------
After iter # : 28
[[ 0.3252395 1.10676098 -4.66678035 -0.31470563 0.7395233 -0.03609679
-0.1941685 1.95541419 4.88022543 1.19603552]
[ 1.42934304 -0.21339803 0.61547082 0.83135241 0.34591937 0.39720566
0.09184825 -1.99747812 -1.27952311 0.43301674]
[-2.60051406 -0.98002904 2.88041952 -0.19416701 -0.2450842 0.44202831
-0.46486949 1.62864689 -1.97563002 0.72243713]]
[-5.19745504 -6.0487155 -1.64002716]
------------------------------
After iter # : 29
[[ 0.25213266 -0.45700128 -4.01246453 0.09800696 0.42587866 0.6695858
-0.58157301 1.71375216 4.20786372 -0.56478902]
[ 0.91870629 -1.0434746 1.86110342 0.75918241 -0.34278594 -0.68595775
0.32838926 -2.0186158 -2.39906462 0.77291631]
[-2.47978482 -0.51500671 1.05300169 -1.83225913 0.88333652 -0.08844914
-0.22786206 2.40217965 -0.06176851 -0.02655215]]
[-3.51835688 -7.7229954 -0.61194507]
------------------------------
After iter # : 30
[[ 0.88823496 0.58998191 -3.95023484 -0.92917463 -0.44138137 0.23917043
0.42344633 0.93093598 3.86616324 0.54647142]
[ 2.10271793 -1.07867461 1.67201792 -0.47079813 -0.79414212 0.15519045
-1.25860005 -3.32284212 -2.70704852 -0.04625841]
[-2.68678904 0.4966921 2.2942838 0.22974526 -0.71332267 0.32719828
-0.57640357 2.02445162 -1.30777744 -1.41152609]]
[-5.15728266 -6.41452556 0.35900203]
------------------------------
After iter # : 31
[[ 0.63660633 0.28917067 -3.66070923 -0.51790163 -0.58533978 0.36584792
1.20062139 1.08310449 3.66336602 -0.90257807]
[ 3.04040502 -1.18837977 1.49528073 -0.94410081 -1.00886039 -0.34348118
-0.91236688 -4.3422028 -2.92192238 -0.17465557]
[-1.13406994 -0.76640528 1.53280497 0.62050653 -1.20092115 0.38426153
-0.37180569 0.57153404 -1.15921368 1.31902994]]
[-3.86475706 -6.41236979 -0.30510748]
------------------------------
After iter # : 32
[[ 0.70468953 -0.18013638 -3.82221085 0.65383186 -0.59045243 -0.01348844
-0.60536896 1.08362729 3.80770867 -0.29910315]
[ 1.46177397 -1.83974591 1.23356257 -1.24154523 1.04129119 -1.57986043
0.10947387 -2.34568219 -1.95849841 0.45289094]
[-2.39095714 0.16110161 1.54135367 0.3224621 -0.54460738 -0.27706659
0.6692095 2.05238307 -0.62552409 -0.02322233]]
[-4.17600948 -4.8824403 -0.9025227 ]
------------------------------
After iter # : 33
[[ 7.82272570e-01 5.23147186e-01 -3.89137433e+00 4.94805677e-04
4.09167928e-01 -1.02848471e+00 -8.16618269e-01 1.02663084e+00
3.84860706e+00 4.56647498e-01]
[ 2.33067149e+00 2.39814711e-01 5.12924208e-01 4.66101360e-01
-1.50366589e-01 -1.17816108e-01 1.19883486e+00 -3.01107339e+00
-1.55850974e+00 2.21131603e-01]
[-2.68290052e+00 1.87650948e-01 2.86336955e+00 4.36484818e-02
2.73733719e-01 5.83377509e-01 1.31142534e+00 1.73454263e+00
-1.92170185e+00 3.88470585e-01]]
[-4.1821583 -5.18693224 0.57312601]
------------------------------
After iter # : 34
[[ 0.3193092 -0.84690198 -4.12356883 -0.61808053 -0.10145616 0.80843924
0.16697511 1.69007929 4.29837857 0.42297458]
[ 1.03951633 -0.87117232 0.88142002 -0.72727718 0.14014869 1.48433628
-0.80533368 -1.67019491 -1.39726408 -0.68737935]
[-1.88339133 0.40802511 1.89111119 0.11702363 0.00740045 -0.07867069
1.43890395 1.27729162 -1.22103962 -1.018166 ]]
[-4.77290014 -4.59734854 -1.74278925]
------------------------------
After iter # : 35
[[ 0.88524704 0.70333668 -3.76599931 -0.2618525 -0.26973672 0.21714959
0.2121653 0.84209899 3.66924598 0.33314392]
[ 2.59859045 0.30231678 -0.28511431 -0.53671784 0.25517139 0.93208761
0.41588584 -2.92754473 -0.81567059 -0.50501041]
[-2.53044522 0.20548351 2.1379417 0.1351346 1.29687571 -0.50314697
0.01882263 1.91809912 -1.20710796 0.16925897]]
[-3.65777416 -4.05282439 -1.15405047]
------------------------------
After iter # : 36
[[ 0.53385971 -0.25391035 -3.44099113 1.02438303 0.25760136 0.0478077
0.51394436 1.09435275 3.47136393 0.4243917 ]
[ 1.93047718 -0.42106239 0.0677711 0.86224896 -1.23204776 1.45943301
0.26911793 -2.31502473 -0.9067408 -0.75457853]
[-2.57747611 0.1415482 1.90806619 -0.44671718 -0.0259697 -0.53468792
-0.18452661 2.08892007 -0.93948481 -0.15185867]]
[-3.92992725 -5.17277079 -0.8728966 ]
------------------------------
After iter # : 37
[[ 0.31893541 -0.48141868 -3.16201247 -0.31268324 0.6363704 -0.62617696
-0.5733622 1.20843929 3.26405984 -0.61242716]
[ 2.34281476 0.68970333 0.50750441 0.14112554 0.24575617 0.17106287
-0.15991825 -3.0227046 -1.55792395 -0.06044275]
[-2.39598826 0.39953089 1.66237814 0.19781938 0.076624 0.71237008
-0.59922338 1.99765154 -0.75355389 0.46688605]]
[-3.92014745 -5.43797461 -0.33918915]
------------------------------
After iter # : 38
[[-0.44300853 -0.42550613 -2.26870103 -0.97907595 -1.10539163 0.07429812
0.78944695 1.66088379 2.63210629 0.53091899]
[ 1.93338636 2.18712907 0.84329312 -0.15833699 0.03897163 -0.78794512
-0.75093642 -2.70727452 -1.74233447 -0.13940723]
[-1.98471869 -0.29123466 1.3088534 0.80128045 -0.35543808 0.25502799
0.31643563 1.68893799 -0.5508575 -0.33142502]]
[-3.66193757 -4.65957632 -0.86106907]
------------------------------
After iter # : 39
[[ 0.69916704 0.2648374 -3.23394675 0.32070175 0.30463575 -0.6625447
0.15163659 0.79522315 3.17721633 -0.28347378]
[ 2.75061563 0.83624566 0.07368349 -0.04356039 0.90927952 0.19919471
-0.9076899 -3.28706261 -1.26734355 -0.210637 ]
[-2.74866648 0.47131731 2.19838226 0.02007493 0.41354545 -0.43635768
0.3014896 2.14564664 -1.17787609 0.27210047]]
[-3.65738754 -5.67694427 -0.10235796]
------------------------------
After iter # : 40
[[ 0.96155122 0.25787336 -3.33402044 -0.31180206 -0.8111628 0.22035598
-0.84781411 0.53536312 3.17154808 0.13570611]
[ 2.25457292 0.75691682 0.0443828 -0.29228003 -0.1389528 -0.59253009
0.10933489 -2.68624965 -1.02156525 0.27695456]
[-2.00303289 0.03647635 1.84549261 -0.25951983 -0.6498331 -0.60163552
0.39617441 1.44153093 -1.12028465 0.83394805]]
[-2.68138419 -5.42020786 -1.09001914]
------------------------------
After iter # : 41
[[ 0.14149982 -0.1552038 -2.89724062 -0.2432251 -0.21596779 -0.24269318
-0.14624722 1.28535193 3.05584754 -0.04200232]
[ 2.42170837 -0.42887075 0.41925954 -0.29120259 -0.41141005 -0.23733535
-0.44506788 -3.07168305 -1.49706312 -0.95317384]
[-1.99616888 0.09973646 1.68604687 0.32741788 0.84281616 0.09084677
0.77562158 1.51335945 -0.95171141 -0.13840053]]
[-4.1318807 -4.94913487 0.35983073]
------------------------------
After iter # : 42
[[ 0.55598958 0.3955398 -3.2754199 -0.44788468 1.22052456 -0.07422355
-0.69754166 0.98519409 3.28367735 -0.27059053]
[ 2.17939075 1.08586237 1.07817203 0.72763327 0.61529484 0.42342922
0.35105907 -3.11571058 -2.10128299 -0.83555629]
[-2.20475771 -0.05232692 1.49252946 -0.02338595 -0.09721158 0.14752137
1.28456057 1.85684855 -0.6534221 -0.36964453]]
[-4.59009077 -5.42431084 -1.29315445]
------------------------------
After iter # : 43
[[-0.26312665 0.18591379 -3.03535684 0.19791077 0.02474518 -0.16581435
0.06587531 1.83270296 3.37920849 -0.02031577]
[ 1.59309509 -1.24224505 0.49024796 0.7850702 1.15118016 0.12128868
-0.39254078 -2.12818598 -1.21553276 -0.81486961]
[-2.33096267 -0.36798865 2.076906 -0.25360239 -1.22425305 0.6667789
-0.52529515 1.71299159 -1.22760574 0.28226229]]
[-3.42973886 -4.96024596 -1.05993316]
------------------------------
Note :
In the first call to
partial_fit(), we passed the list of possible target class labels. For subsequent calls topartial_fit(), this is not required.Observe the changing values of the classifier attributes :
coef_andintercept_which we are printing in each iteration.
test_score = clf2.score(X_test ,y_test)
test_score
0.8964
Let's evaluate the classifier by examining the confusion_matrix.
y_pred = clf2.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay.from_predictions(y_test,y_pred)
plt.show()
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.90 0.97 0.93 2535
1 0.88 0.89 0.88 2511
2 0.91 0.83 0.87 2454
accuracy 0.90 7500
macro avg 0.90 0.90 0.90 7500
weighted avg 0.90 0.90 0.90 7500
Apart from SGDClassifier, we can also train Perceptron(), MultinomialNB(), in a similar manner.
CountVectorizer vs HashingVectorizer¶
Vectorizers are used to convert a collection of text documents to a vector representation, thus helping in preprocessing them before applying any model on these text documents.
CountVectorizer and HashingVectorizer both perform the task of vectorizing the text documents. However, there are some differences among them.
One difference is that
HashingVectorizerdoes not store the resulting vocabulary (i.e. the unique tokens). Hence, it can be used to learn from data that does not fit into the computer's main memory.Each mini-batch is vectorized using
HashingVectorizerso as to guarantee that the input space of the estimator has always the same dimensionality.With
HashingVectorizer, each token directly maps to a pre-defined column position in a matrix.For example, if there are 100 columns in the resultant (vectorized) matrix, each token (word) maps to 1 of the 100 columns. The mapping between the word and the position in matrix is done using hashing.
In other words in
HashingVectorizer, each token transforms to a column position instead of adding to the vocabulary.
Not storing the vocabulary is useful while handling large datasets. This is because holding a huge token vocabulary compromising of millions of words may be a challenging when the memory is limited.
Since HashingVectorizer does not store vocabulary , its object not only takes lesser space, it also alleviates any dependence with function calls performed on the previous chunk of data in case of incremental learning.
Let us take some sample text documents and vectorize them, first using CountVectorizer and then HashingVectorizer.
text_documents = ['The well-known saying an apple a day keeps the doctor away has a very straightforward, literal meaning, that the eating of fruit maintains good health.',
'The proverb fist appeared in print in 1866 and over 150 years later is advice that we still pass down through generation.',
'British apples are one of the nations best loved fruit and according to Great British Apples, we consume around 122,000 tonnes of them each year.',
'But what are the health benefits, and do they really keep the doctor away?']
1. CountVectorizer¶
We will first import the library and then create an object of CountVectorizer class.
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
We will now use this object to vectorize the input text documents using the function fit_transform().
X_c = count_vectorizer.fit_transform(text_documents)
X_c.shape
(4, 66)
Here, 66 is the size of the vocabulary.
We can also see the vocabulary using vocabulary_ attribute.
count_vectorizer.vocabulary_
{'the': 54,
'well': 62,
'known': 36,
'saying': 50,
'an': 6,
'apple': 9,
'day': 19,
'keeps': 35,
'doctor': 21,
'away': 13,
'has': 30,
'very': 60,
'straightforward': 52,
'literal': 38,
'meaning': 41,
'that': 53,
'eating': 24,
'of': 43,
'fruit': 26,
'maintains': 40,
'good': 28,
'health': 31,
'proverb': 48,
'fist': 25,
'appeared': 8,
'in': 32,
'print': 47,
'1866': 3,
'and': 7,
'over': 45,
'150': 2,
'years': 65,
'later': 37,
'is': 33,
'advice': 5,
'we': 61,
'still': 51,
'pass': 46,
'down': 22,
'through': 57,
'generation': 27,
'british': 16,
'apples': 10,
'are': 11,
'one': 44,
'nations': 42,
'best': 15,
'loved': 39,
'according': 4,
'to': 58,
'great': 29,
'consume': 18,
'around': 12,
'122': 1,
'000': 0,
'tonnes': 59,
'them': 55,
'each': 23,
'year': 64,
'but': 17,
'what': 63,
'benefits': 14,
'do': 20,
'they': 56,
'really': 49,
'keep': 34}
And 4 is the number of text documents.
Following is the representation of four text documents :
print(X_c)
(0, 54) 3 (0, 62) 1 (0, 36) 1 (0, 50) 1 (0, 6) 1 (0, 9) 1 (0, 19) 1 (0, 35) 1 (0, 21) 1 (0, 13) 1 (0, 30) 1 (0, 60) 1 (0, 52) 1 (0, 38) 1 (0, 41) 1 (0, 53) 1 (0, 24) 1 (0, 43) 1 (0, 26) 1 (0, 40) 1 (0, 28) 1 (0, 31) 1 (1, 54) 1 (1, 53) 1 (1, 48) 1 : : (2, 39) 1 (2, 4) 1 (2, 58) 1 (2, 29) 1 (2, 18) 1 (2, 12) 1 (2, 1) 1 (2, 0) 1 (2, 59) 1 (2, 55) 1 (2, 23) 1 (2, 64) 1 (3, 54) 2 (3, 21) 1 (3, 13) 1 (3, 31) 1 (3, 7) 1 (3, 11) 1 (3, 17) 1 (3, 63) 1 (3, 14) 1 (3, 20) 1 (3, 56) 1 (3, 49) 1 (3, 34) 1
2. HashingVectorizer¶
Let us now see how HashingVectorizer is different from CountVectorizer.
We will create an object of HashingVectorizer. While creating the object, we need to specify the number of features we wish to have in the feature matrix.
from sklearn.feature_extraction.text import HashingVectorizer
hashing_vectorizer = HashingVectorizer(n_features=50)
An important parameter of HashingVectorizer class is n_features. It declares the number of features (columns) in the output feature matrix.
NOTE : Small numbers of features are likely to cause hash collisions, but large numbers will cause larger coefficient dimensions in linear learners.
Let's perform hashing vectorization with fit_transform.
X_h = hashing_vectorizer.fit_transform(text_documents)
Let us examine the shape of the transformed feature matrix. The number of columns in this matrix is equal to the n_features attribute we specified.
X_h.shape
(4, 50)
Following is the representation of the four text documents :
print(X_h[0])
(0, 5) 0.0 (0, 8) -0.47140452079103173 (0, 10) -0.23570226039551587 (0, 11) -0.23570226039551587 (0, 13) 0.0 (0, 18) -0.23570226039551587 (0, 20) 0.23570226039551587 (0, 26) 0.0 (0, 29) 0.23570226039551587 (0, 33) 0.23570226039551587 (0, 36) -0.23570226039551587 (0, 38) 0.47140452079103173 (0, 39) -0.23570226039551587 (0, 45) -0.23570226039551587 (0, 46) 0.23570226039551587
IMP :
Overall, HashingVectorizer is a good choice if we are falling short of memory and resources, or we need to perform incremental learning.
However, CountVectorizer is a good choice if we need to access the actual tokens.
Demonstration¶
1. Downloading the dataset¶
We download a dataset from UCI ML datasets's library.
Instead of downloading, unzipping and then reading, we are directly reading the zipped csv file.
For that purpose, we are making use of urllib.request, BytesIO and TextIOWrapper classes.
This is a sentiment analysis dataset. There are only two columns in the dataset. One for the textual review and the other for the sentiment.
from io import StringIO, BytesIO, TextIOWrapper
from zipfile import ZipFile
import urllib.request
resp = urllib.request.urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip')
zipfile = ZipFile(BytesIO(resp.read()))
data = TextIOWrapper(zipfile.open('sentiment labelled sentences/amazon_cells_labelled.txt'),encoding='utf-8')
df = pd.read_csv(data, sep='\t')
df.columns = ['review','sentiment']
2. Exploring the dataset¶
df.head()
| review | sentiment | |
|---|---|---|
| 0 | Good case, Excellent value. | 1 |
| 1 | Great for the jawbone. | 1 |
| 2 | Tied to charger for conversations lasting more... | 0 |
| 3 | The mic is great. | 1 |
| 4 | I have to jiggle the plug to get it to line up... | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 999 entries, 0 to 998 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review 999 non-null object 1 sentiment 999 non-null int64 dtypes: int64(1), object(1) memory usage: 15.7+ KB
df.describe()
| sentiment | |
|---|---|
| count | 999.000000 |
| mean | 0.500501 |
| std | 0.500250 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 1.000000 |
| 75% | 1.000000 |
| max | 1.000000 |
As we can see,
There are 999 samples in the dataset.
The possible classes for sentiment are 1 and 0.
3. Splitting data into train and test¶
X = df.loc[:,'review']
# X2 = df.iloc[:,0]
y = df.loc[:,'sentiment']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((799,), (200,), (799,), (200,))
4. Preprocessing¶
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer()
5. Creating an instance of the SGDClassifier¶
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(penalty='l2', loss='hinge')
6. Iteration 1 of partial_fit()¶
We will assume we do not have sufficient memory to handle all the 799 samples in one go for training purpose.
So, we will take the first 400 samples from the training data and
partial_fit()our classifier.Another use case of
partial_fithere could also be a scenario where we only have 400 samples available at a time. So, we fit our classifier with them.However, we
partial_fitit, to have the possibility of traning it with more data later whenever that becomes available.
X_train_part1_hashed = vectorizer.fit_transform(X_train[0:400])
y_train_part1 = y_train[0:400]
# we need to mention all classes in the first iteration of partial_fit()
all_classes = np.unique(df.loc[:, 'sentiment'])
classifier.partial_fit(X_train_part1_hashed,
y_train_part1, classes=all_classes)
SGDClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier()
Let us now use this classifier on our test data that we had kept aside earlier.
# first we will have to preprocess the X_test with the same vectorizer that was fit on the train data.
X_test_hashed = vectorizer.transform(X_test)
test_score = classifier.score(X_test_hashed, y_test)
print('Test score : ', test_score)
Test score : 0.71
Note : We can also store this classifier using pickle object and can access it later.
7. Iteration 2 of partial_fit()¶
We will now assume that more data became available. So, we will fit the same classifier with more data and observe if our test score improves.
X_train_part2_hashed = vectorizer.fit_transform(X_train[400:])
y_train_part2 = y_train[400:]
classifier.partial_fit(X_train_part2_hashed, y_train_part2)
SGDClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier()
test_score = classifier.score(X_test_hashed, y_test)
print('Test score : ', test_score)
Test score : 0.74
We see that our test score has improved after we fed more data to the classifier in the second iteration of partial_fit().
SVM classifer on MNIST¶
In this notebook, we will implement multiclass MNIST digit recognition classifier with SVM's.
Importing Libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# Import the libraries for performing classification
from keras.datasets import mnist
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score ,train_test_split ,GridSearchCV ,StratifiedShuffleSplit
Loading MNIST dataset¶
(X_train, y_train), (X_test, y_test) = mnist.load_data()
Flatten each input image into a vector of length 784
X_train = X_train.reshape(X_train.shape[0], 28*28)
X_test = X_test.reshape(X_test.shape[0], 28*28)
Normalizing
X_train = X_train/255
X_test = X_test/255
X_train.shape ,y_train.shape ,X_test.shape ,y_test.shape
((60000, 784), (60000,), (10000, 784), (10000,))
Let us consider the first 10000 images in training dataset and first 2000 images in testing dataset.
X_train = X_train[0:10000, :]
y_train = y_train[0:10000]
X_test = X_test[0:2000, :]
y_test = y_test[0:2000]
Linear SVM for MNIST multiclass classification¶
Using Pipeline¶
pipe_1 = Pipeline([('scaler', MinMaxScaler()),
('classifier', SVC(kernel='linear', C=1))])
pipe_1.fit(X_train, y_train.ravel())
Pipeline(steps=[('scaler', MinMaxScaler()),
('classifier', SVC(C=1, kernel='linear'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('classifier', SVC(C=1, kernel='linear'))])MinMaxScaler()
SVC(C=1, kernel='linear')
Evaluate the model using crossvalidation
accuracy = cross_val_score(pipe_1, X_train, y_train.ravel(), cv=3)
print('Training Accuracy : {:.4f}'.format(accuracy.mean()*100))
Training Accuracy : 91.3799
Visualizing the confusion matrix
y_pred = pipe_1.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title('Confusion matrix')
plt.show()
Printing classification report
print(classification_report(y_test,y_pred))
precision recall f1-score support
0 0.91 0.96 0.94 175
1 0.96 0.99 0.98 234
2 0.89 0.89 0.89 219
3 0.85 0.88 0.86 207
4 0.90 0.94 0.92 217
5 0.89 0.85 0.87 179
6 0.90 0.93 0.91 178
7 0.89 0.87 0.88 205
8 0.89 0.83 0.86 192
9 0.89 0.84 0.86 194
accuracy 0.90 2000
macro avg 0.90 0.90 0.90 2000
weighted avg 0.90 0.90 0.90 2000
Nonlinear SVM for MNIST multiclass classification¶
Using Pipeline¶
pipe_2 = Pipeline([('scalerr', MinMaxScaler()),
('classifier', SVC(kernel='rbf', gamma=0.1, C=1))])
pipe_2.fit(X_train, y_train.ravel())
Pipeline(steps=[('scalerr', MinMaxScaler()),
('classifier', SVC(C=1, gamma=0.1))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scalerr', MinMaxScaler()),
('classifier', SVC(C=1, gamma=0.1))])MinMaxScaler()
SVC(C=1, gamma=0.1)
Evaluate the model using crossvalidation
accuracy = cross_val_score(pipe_2, X_train, y_train.ravel(), cv=2)
print('Training Accuracy : {:.4f}'.format(accuracy.mean()*100))
Training Accuracy : 82.8700
Visualizing the confusion matrix
y_pred = pipe_2.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test,y_pred)
plt.show()
Printing classification report
print(classification_report(y_test,y_pred))
precision recall f1-score support
0 0.99 0.89 0.94 175
1 0.99 0.98 0.98 234
2 0.47 0.99 0.63 219
3 0.93 0.89 0.91 207
4 0.95 0.82 0.88 217
5 0.98 0.78 0.87 179
6 0.99 0.73 0.84 178
7 0.96 0.79 0.87 205
8 0.95 0.74 0.84 192
9 0.95 0.84 0.89 194
accuracy 0.85 2000
macro avg 0.92 0.84 0.86 2000
weighted avg 0.91 0.85 0.86 2000
Using GridSearchCV¶
We can use a grid search cross-validation to explore combinations of parameters.
Here we will adjust C (which controls the margin hardness) and gamma (which controls the size of the radial basis function kernel), and determines the best models.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
takes a very long amount of time to finish training (for me it took about 7 hrs)
grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train.ravel())
GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.2,
train_size=None),
estimator=SVC(),
param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.2,
train_size=None),
estimator=SVC(),
param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})SVC()
SVC()
grid.best_params_
{'C': 10.0, 'gamma': 0.001}
grid.best_score_
0.9453333333333332
Decision Trees¶
Decision Trees are capable of finding complex non-linear relationships in the data.
They can perform both classification and regression tasks.
Decision Trees for Regression¶
In the first half of this notebook, we will demonstrate decision trees for regression task with Califiornia housing dataset and DecisionTreeRegressor class in sklearn.
Importing Libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.tree import export_text
import warnings
warnings.filterwarnings('ignore')
np.random.seed(36)
Let's use ShuffleSplit as cv with 10 splits and 20 % examples set aside as test examples.
cv = ShuffleSplit(n_splits=10,test_size=0.2,random_state=42)
Loading the dataset¶
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
Data Splitting¶
com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels, random_state=42)
train_features, dev_features, train_labels, dev_labels = train_test_split(com_train_features, com_train_labels, random_state=42)
Model Setup¶
dt_reg_pipeline = Pipeline([('scaler', StandardScaler()),
('dt_reg', DecisionTreeRegressor(max_depth=3, random_state=42))])
dt_reg_cv_results = cross_validate(dt_reg_pipeline,
com_train_features,
com_train_labels,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
dt_reg_train_error = -1 * dt_reg_cv_results['train_score']
dt_reg_test_error = -1 * dt_reg_cv_results['test_score']
print(f'Mean absolute error of linear regression model on the train set : \n'
f'{dt_reg_train_error.mean():.3f}+/- {dt_reg_train_error.std():.3f}')
print()
print(f'Mean absolute error of linear regression model on the test set : \n'
f'{dt_reg_test_error.mean():.3f}+/- {dt_reg_test_error.std():.3f}')
Mean absolute error of linear regression model on the train set : 0.590+/- 0.005 Mean absolute error of linear regression model on the test set : 0.593+/- 0.007
Visualizing the tree¶
One of the advantages of using a decision tree classifier is that the output is intuitive to understand and can be easily visualized.
This can be done in two ways:
As a tree digram
As a text based diagram
1. As a tree diagram¶
We need to call fit function on pipeline object before printing the tree.
dt_reg_pipeline.fit(train_features, train_labels)
Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=3, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=3, random_state=42))])StandardScaler()
DecisionTreeRegressor(max_depth=3, random_state=42)
plt.figure(figsize=(25,5))
a = tree.plot_tree(dt_reg_pipeline[-1],
feature_names=features.columns,
rounded=True,
filled=True,
fontsize=12)
plt.show()
2. As a text-based diagram¶
# export the decision rules
tree_rules = export_text(dt_reg_pipeline[-1])
print(tree_rules)
|--- feature_0 <= 0.62 | |--- feature_0 <= -0.41 | | |--- feature_2 <= -0.45 | | | |--- value: [1.65] | | |--- feature_2 > -0.45 | | | |--- value: [1.17] | |--- feature_0 > -0.41 | | |--- feature_5 <= -0.26 | | | |--- value: [2.84] | | |--- feature_5 > -0.26 | | | |--- value: [1.89] |--- feature_0 > 0.62 | |--- feature_0 <= 1.61 | | |--- feature_5 <= -0.12 | | | |--- value: [3.51] | | |--- feature_5 > -0.12 | | | |--- value: [2.66] | |--- feature_0 > 1.61 | | |--- feature_0 <= 2.37 | | | |--- value: [3.94] | | |--- feature_0 > 2.37 | | | |--- value: [4.72]
Using the tree for prediction¶
test_labels_pred = dt_reg_pipeline.predict(test_features)
Evaluating the tree¶
mae = mean_absolute_error(test_labels, test_labels_pred)
mse = mean_squared_error(test_labels, test_labels_pred)
r2 = r2_score(test_labels, test_labels_pred)
print('MAE : ',mae)
print('MSE : ',mse)
print('R2 score : ',r2)
MAE : 0.6005762942842664 MSE : 0.6417557936098145 R2 score : 0.5150037690483743
HPT using GridSearchCV¶
Let us now try to improve the model by tuning the hyperparameters.
param_grid = {'dt_reg__max_depth': range(1, 20),
'dt_reg__min_samples_split': range(2, 8)}
dt_grid_search = GridSearchCV(dt_reg_pipeline,
param_grid=param_grid,
n_jobs=2,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True)
dt_grid_search.fit(com_train_features, com_train_labels)
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=3,
random_state=42))]),
n_jobs=2,
param_grid={'dt_reg__max_depth': range(1, 20),
'dt_reg__min_samples_split': range(2, 8)},
return_train_score=True, scoring='neg_mean_absolute_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size=0.2, train_size=None),
estimator=Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=3,
random_state=42))]),
n_jobs=2,
param_grid={'dt_reg__max_depth': range(1, 20),
'dt_reg__min_samples_split': range(2, 8)},
return_train_score=True, scoring='neg_mean_absolute_error')Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=3, random_state=42))])StandardScaler()
DecisionTreeRegressor(max_depth=3, random_state=42)
dt_grid_search.best_params_
{'dt_reg__max_depth': 11, 'dt_reg__min_samples_split': 5}
print('Mean cross validated score of the best estimator : ', -
1*dt_grid_search.best_score_)
Mean cross validated score of the best estimator : 0.4283313044233501
mean_train_error = -1 * \
dt_grid_search.cv_results_['mean_train_score'][dt_grid_search.best_index_]
mean_test_error = -1 * \
dt_grid_search.cv_results_['mean_test_score'][dt_grid_search.best_index_]
std_train_error = -1 * \
dt_grid_search.cv_results_['std_train_score'][dt_grid_search.best_index_]
std_test_error = -1 * \
dt_grid_search.cv_results_['std_test_score'][dt_grid_search.best_index_]
print(f'Best Mean absolute error of decision tree regression model on the train set: \n'f'{mean_train_error:.3f} +/- {std_train_error:.3f}')
print()
print(f'Best Mean absolute error of decision tree regression model on the test set: \n'f'{mean_test_error:.3f} +/- {std_test_error:.3f}')
Best Mean absolute error of decision tree regression model on the train set: 0.278 +/- -0.006 Best Mean absolute error of decision tree regression model on the test set: 0.428 +/- -0.012
Let's retrain the model with the best hyperparameter value.
dt_reg_pipeline.set_params(dt_reg__max_depth=11, dt_reg__min_samples_split=5).fit(com_train_features, com_train_labels)
Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=11, min_samples_split=5,
random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('dt_reg',
DecisionTreeRegressor(max_depth=11, min_samples_split=5,
random_state=42))])StandardScaler()
DecisionTreeRegressor(max_depth=11, min_samples_split=5, random_state=42)
Evaluating after HPT.
test_labels_pred = dt_reg_pipeline.predict(test_features)
mae = mean_absolute_error(test_labels, test_labels_pred)
mse = mean_squared_error(test_labels, test_labels_pred)
r2 = r2_score(test_labels, test_labels_pred)
print('MAE : ',mae)
print('MSE : ',mse)
print('R2 score : ',r2)
MAE : 0.4248952183820017 MSE : 0.4154957726373763 R2 score : 0.6859960039130073
Decision Trees using Pipelines¶
For this section of the notebook, we will use Abalone data.
Loading the dataset¶
Abalone is a type of consumable snail whose price varies as per its age.
The aim is to predict the age of abalone from physical measurements.
The age of abalone is traditionally determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope-a boring and time-consuming task.
Other measurements, which are easier to obtain, are used to predict the age.
column_names = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', header=None,names=column_names)
abalone_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4177 entries, 0 to 4176 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sex 4177 non-null object 1 Length 4177 non-null float64 2 Diameter 4177 non-null float64 3 Height 4177 non-null float64 4 Whole weight 4177 non-null float64 5 Shucked weight 4177 non-null float64 6 Viscera weight 4177 non-null float64 7 Shell weight 4177 non-null float64 8 Rings 4177 non-null int64 dtypes: float64(7), int64(1), object(1) memory usage: 293.8+ KB
abalone_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Length | 4177.0 | 0.523992 | 0.120093 | 0.0750 | 0.4500 | 0.5450 | 0.615 | 0.8150 |
| Diameter | 4177.0 | 0.407881 | 0.099240 | 0.0550 | 0.3500 | 0.4250 | 0.480 | 0.6500 |
| Height | 4177.0 | 0.139516 | 0.041827 | 0.0000 | 0.1150 | 0.1400 | 0.165 | 1.1300 |
| Whole weight | 4177.0 | 0.828742 | 0.490389 | 0.0020 | 0.4415 | 0.7995 | 1.153 | 2.8255 |
| Shucked weight | 4177.0 | 0.359367 | 0.221963 | 0.0010 | 0.1860 | 0.3360 | 0.502 | 1.4880 |
| Viscera weight | 4177.0 | 0.180594 | 0.109614 | 0.0005 | 0.0935 | 0.1710 | 0.253 | 0.7600 |
| Shell weight | 4177.0 | 0.238831 | 0.139203 | 0.0015 | 0.1300 | 0.2340 | 0.329 | 1.0050 |
| Rings | 4177.0 | 9.933684 | 3.224169 | 1.0000 | 8.0000 | 9.0000 | 11.000 | 29.0000 |
Let's now see the type and name of the features :
Sex: The is the gender of the abalone and has categorical value (M, F or I)
Length: The longest measurement of the abalone shell in mm. Continuous numeric value.
Diameter: The measurement of the abalone shell perpendicular to lenght in mm. Continuous numeric value.
Height: Height of the shell in mm. Continuous numeric value. Whole Weight: Weight of the abalone in grams. Continous numeric value.
Shucked Weight: Weight of just the meat in abalone in grams. Continuous numeric value.
Viscera Weight: Weight of the abalone after bleeding in grams. Continuous numeric value.
Shell Weight: Weight of the abalone after being dried in grams. Continuous numeric value.
Rings: This is the target, that is the feature that we will train the model to predict. As mentioned earlier, we are interested in the age of the abalone and it has been established that number of rings + 1.5 gives the age. Discrete numeric value.
Visualization of Abalone Dataset¶
abalone_data.hist(bins=50, figsize=(15,15))
plt.show()
sns.pairplot(abalone_data, diag_kind='hist')
plt.show()
sns.heatmap(abalone_data.iloc[:, :-1].corr(), annot=True, square=True)
plt.show()
sns.boxplot(data=abalone_data.iloc[:, :-1], orient='h', palette='Set2')
plt.show()
We find different features to be having different ranges through this box-plot, which indicates that scaling the features may be useful.
Preprocessing¶
From the information above, all features are continuous variables except for the Sex feature.
Handling Missing values¶
abalone_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Length | 4177.0 | 0.523992 | 0.120093 | 0.0750 | 0.4500 | 0.5450 | 0.615 | 0.8150 |
| Diameter | 4177.0 | 0.407881 | 0.099240 | 0.0550 | 0.3500 | 0.4250 | 0.480 | 0.6500 |
| Height | 4177.0 | 0.139516 | 0.041827 | 0.0000 | 0.1150 | 0.1400 | 0.165 | 1.1300 |
| Whole weight | 4177.0 | 0.828742 | 0.490389 | 0.0020 | 0.4415 | 0.7995 | 1.153 | 2.8255 |
| Shucked weight | 4177.0 | 0.359367 | 0.221963 | 0.0010 | 0.1860 | 0.3360 | 0.502 | 1.4880 |
| Viscera weight | 4177.0 | 0.180594 | 0.109614 | 0.0005 | 0.0935 | 0.1710 | 0.253 | 0.7600 |
| Shell weight | 4177.0 | 0.238831 | 0.139203 | 0.0015 | 0.1300 | 0.2340 | 0.329 | 1.0050 |
| Rings | 4177.0 | 9.933684 | 3.224169 | 1.0000 | 8.0000 | 9.0000 | 11.000 | 29.0000 |
The count row shows that there are no missing values.
However, in the Height feature, the minimum value is zero. This possibility calls for a missing value in the data and we will process tha missing value.
We first check how many missing values are in the Height feature and which class is it in.
(abalone_data['Height']==0).sum()
2
abalone_data[abalone_data['Height']==0]
| Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings | |
|---|---|---|---|---|---|---|---|---|---|
| 1257 | I | 0.430 | 0.34 | 0.0 | 0.428 | 0.2065 | 0.0860 | 0.1150 | 8 |
| 3996 | I | 0.315 | 0.23 | 0.0 | 0.134 | 0.0575 | 0.0285 | 0.3505 | 6 |
The number of missing values is 2 and is in the infant sex.
Then we change the value 0 to null. We will fill in the missing value with the average Height feature for the infant gender.
mean = pd.pivot_table(abalone_data, index=['Sex'], aggfunc={'Height': np.mean})
mean
| Height | |
|---|---|
| Sex | |
| F | 0.158011 |
| I | 0.107996 |
| M | 0.151381 |
So we will fill in the missing value with 0.107996. (will perform the next step a little later)
Target Column¶
Next, take a look at the target in this case in the Rings column
abalone_data['Rings'].unique()
array([15, 7, 9, 10, 8, 20, 16, 19, 14, 11, 12, 18, 13, 5, 4, 6, 21,
17, 22, 1, 3, 26, 23, 29, 2, 27, 25, 24], dtype=int64)
abalone_data['Rings'].value_counts().sort_index()
1 1 2 1 3 15 4 57 5 115 6 259 7 391 8 568 9 689 10 634 11 487 12 267 13 203 14 126 15 103 16 67 17 58 18 42 19 32 20 26 21 14 22 6 23 9 24 2 25 1 26 1 27 2 29 1 Name: Rings, dtype: int64
We can see that the target is 1 to 29 (but there is no 28), so the classification we are goinng to do is a multi-class classification.
Storing data in the form of X & y¶
X = abalone_data.iloc[:,:-1]
y = abalone_data.iloc[:,-1]
X[:5]
| Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | |
|---|---|---|---|---|---|---|---|---|
| 0 | M | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 |
| 1 | M | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 |
| 2 | F | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 |
| 3 | M | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 |
| 4 | I | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 |
y[:5]
0 15 1 7 2 9 3 10 4 7 Name: Rings, dtype: int64
Splitting data into train and test sets.¶
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
Pipelining¶
We will use pipelines to perform preprocessing of the data, which will include: handling missing (or 0) values, scaling the features and handling the categorical feature (viz., sex in this case)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
Identifying numeric and categorical features (to be able to preprocess them differently.)
numeric_features = ['Length','Diameter','Height','Whole weight','Shucked weight', 'Viscera weight', 'Shell weight']
categorical_features = ['Sex']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(missing_values=0, strategy='constant',fill_value=0.107996)),
('scaler',StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
transformers=[('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)]
)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
steps=[('preprocessor', preprocessor),
('classifier', DecisionTreeClassifier(max_depth=3, random_state=42))]
)
clf.fit(X_train, y_train)
print('Model score : {:.3f}'.format(clf.score(X_test,y_test)))
Model score : 0.245
Evaluation¶
y_pred = clf.predict(X_test)
Let us compare the actual and predicted values of y.
# comparision = np.concatenate(
# (y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), 1)
# for each in comparision:
# print(each)
Confusion Matrix by ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title('Confusion matrix')
plt.show()
Confusion Matrix by heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(confusion_matrix(y_test ,y_pred) ,annot=True , yticklabels=False ,cbar=False ,cmap='Greens')
<AxesSubplot:>
Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test ,y_pred))
precision recall f1-score support
3 0.00 0.00 0.00 5
4 0.26 0.45 0.33 11
5 0.38 0.42 0.40 33
6 0.00 0.00 0.00 47
7 0.41 0.57 0.48 98
8 0.21 0.38 0.27 113
9 0.21 0.19 0.20 127
10 0.20 0.46 0.28 107
11 0.16 0.15 0.16 95
12 0.00 0.00 0.00 66
13 0.00 0.00 0.00 39
14 0.00 0.00 0.00 26
15 0.00 0.00 0.00 18
16 0.00 0.00 0.00 14
17 0.00 0.00 0.00 10
18 0.00 0.00 0.00 5
19 0.00 0.00 0.00 8
20 0.00 0.00 0.00 8
21 0.00 0.00 0.00 2
22 0.00 0.00 0.00 1
23 0.00 0.00 0.00 2
29 0.00 0.00 0.00 1
accuracy 0.25 836
macro avg 0.08 0.12 0.10 836
weighted avg 0.17 0.25 0.20 836
Cross-Val Score
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
print('Accuracy of each fold : \n', list(acc*100))
print()
print('Accuracy : ',acc.mean()*100)
Accuracy of each fold : [27.46268656716418, 22.45508982035928, 23.952095808383234, 24.550898203592812, 24.251497005988025, 23.952095808383234, 25.449101796407188, 30.83832335329341, 26.34730538922156, 26.047904191616766] Accuracy : 25.530699794440963
Visualizing the decision tree¶
plt.figure(figsize=(30,10))
a = tree.plot_tree(clf['classifier'],
feature_names=column_names,
rounded=True,
filled=True,
fontsize=12)
plt.show()
Finding the best parameters using GridSearchCV¶
X_train_new = preprocessor.fit_transform(X_train)
tuned_parameters = [{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'min_samples_split':[2, 4, 6, 8, 10]}]
scores = ['recall']
for score in scores:
clf_cv = GridSearchCV(DecisionTreeClassifier(),
tuned_parameters,
scoring=f'{score}_macro')
clf_cv.fit(X_train_new, y_train)
print('Best parameters :' ,clf_cv.best_params_)
print()
print('Grid Score is as follows : \n')
means = clf_cv.cv_results_['mean_test_score']
stds = clf_cv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf_cv.cv_results_['params']):
print(f'{mean:0.3f} (+/-) {std*2:0.03f} for {params}')
Best parameters : {'max_depth': 5, 'min_samples_split': 2}
Grid Score is as follows :
0.073 (+/-) 0.009 for {'max_depth': 1, 'min_samples_split': 2}
0.073 (+/-) 0.009 for {'max_depth': 1, 'min_samples_split': 4}
0.073 (+/-) 0.009 for {'max_depth': 1, 'min_samples_split': 6}
0.073 (+/-) 0.009 for {'max_depth': 1, 'min_samples_split': 8}
0.073 (+/-) 0.009 for {'max_depth': 1, 'min_samples_split': 10}
0.107 (+/-) 0.011 for {'max_depth': 2, 'min_samples_split': 2}
0.107 (+/-) 0.011 for {'max_depth': 2, 'min_samples_split': 4}
0.107 (+/-) 0.011 for {'max_depth': 2, 'min_samples_split': 6}
0.107 (+/-) 0.011 for {'max_depth': 2, 'min_samples_split': 8}
0.107 (+/-) 0.011 for {'max_depth': 2, 'min_samples_split': 10}
0.131 (+/-) 0.012 for {'max_depth': 3, 'min_samples_split': 2}
0.131 (+/-) 0.012 for {'max_depth': 3, 'min_samples_split': 4}
0.131 (+/-) 0.012 for {'max_depth': 3, 'min_samples_split': 6}
0.131 (+/-) 0.012 for {'max_depth': 3, 'min_samples_split': 8}
0.131 (+/-) 0.012 for {'max_depth': 3, 'min_samples_split': 10}
0.134 (+/-) 0.022 for {'max_depth': 4, 'min_samples_split': 2}
0.134 (+/-) 0.022 for {'max_depth': 4, 'min_samples_split': 4}
0.134 (+/-) 0.022 for {'max_depth': 4, 'min_samples_split': 6}
0.134 (+/-) 0.022 for {'max_depth': 4, 'min_samples_split': 8}
0.134 (+/-) 0.022 for {'max_depth': 4, 'min_samples_split': 10}
0.144 (+/-) 0.018 for {'max_depth': 5, 'min_samples_split': 2}
0.144 (+/-) 0.018 for {'max_depth': 5, 'min_samples_split': 4}
0.144 (+/-) 0.018 for {'max_depth': 5, 'min_samples_split': 6}
0.144 (+/-) 0.018 for {'max_depth': 5, 'min_samples_split': 8}
0.143 (+/-) 0.017 for {'max_depth': 5, 'min_samples_split': 10}
0.130 (+/-) 0.031 for {'max_depth': 6, 'min_samples_split': 2}
0.136 (+/-) 0.025 for {'max_depth': 6, 'min_samples_split': 4}
0.136 (+/-) 0.024 for {'max_depth': 6, 'min_samples_split': 6}
0.136 (+/-) 0.025 for {'max_depth': 6, 'min_samples_split': 8}
0.135 (+/-) 0.023 for {'max_depth': 6, 'min_samples_split': 10}
0.142 (+/-) 0.021 for {'max_depth': 7, 'min_samples_split': 2}
0.137 (+/-) 0.033 for {'max_depth': 7, 'min_samples_split': 4}
0.138 (+/-) 0.034 for {'max_depth': 7, 'min_samples_split': 6}
0.138 (+/-) 0.033 for {'max_depth': 7, 'min_samples_split': 8}
0.134 (+/-) 0.031 for {'max_depth': 7, 'min_samples_split': 10}
0.130 (+/-) 0.034 for {'max_depth': 8, 'min_samples_split': 2}
0.128 (+/-) 0.035 for {'max_depth': 8, 'min_samples_split': 4}
0.130 (+/-) 0.035 for {'max_depth': 8, 'min_samples_split': 6}
0.128 (+/-) 0.035 for {'max_depth': 8, 'min_samples_split': 8}
0.127 (+/-) 0.034 for {'max_depth': 8, 'min_samples_split': 10}
0.130 (+/-) 0.036 for {'max_depth': 9, 'min_samples_split': 2}
0.128 (+/-) 0.036 for {'max_depth': 9, 'min_samples_split': 4}
0.127 (+/-) 0.038 for {'max_depth': 9, 'min_samples_split': 6}
0.129 (+/-) 0.032 for {'max_depth': 9, 'min_samples_split': 8}
0.128 (+/-) 0.031 for {'max_depth': 9, 'min_samples_split': 10}
Let us now create a new pipeline using the best features identified above.
clf2 = Pipeline(steps=[('preprocessor',preprocessor),
('classifier',DecisionTreeClassifier(max_depth=5,min_samples_split = 2, random_state=42))] )
clf2.fit(X_train, y_train)
print('Model score : {:.3f}'.format(clf2.score(X_test,y_test)))
Model score : 0.272
Decision Trees for Classification¶
In this half of the notebook, we will demonstrate decision trees for classification task with Iris dataset and DecisionTreeClassifier class in sklearn.
Let's load Iris dataset with load_irisAPI
from sklearn.datasets import load_iris
features, labels = load_iris(return_X_y=True, as_frame=True)
Let's split the data into train and test.
train_features, test_features, train_labels, test_labels = train_test_split(
features, labels, test_size=0.2, random_state=42)
Define the decision tree classifier as part of pipeline.
from sklearn.preprocessing import MinMaxScaler
dt_pipeline = Pipeline([('scaler', MinMaxScaler()),
('dt_classifier', DecisionTreeClassifier(max_depth=3,random_state=42))])
Training the classifier.
dt_pipeline.fit(train_features, train_labels)
Pipeline(steps=[('scaler', MinMaxScaler()),
('dt_classifier',
DecisionTreeClassifier(max_depth=3, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('dt_classifier',
DecisionTreeClassifier(max_depth=3, random_state=42))])MinMaxScaler()
DecisionTreeClassifier(max_depth=3, random_state=42)
Now that the classifier is trained, let's evaluate it on the test set with :
Confusion matrix
Classification report
ConfusionMatrixDisplay.from_estimator(dt_pipeline, test_features, test_labels)
plt.show()
print(classification_report(test_labels, dt_pipeline.predict(test_features)))
precision recall f1-score support
0 1.00 1.00 1.00 10
1 1.00 1.00 1.00 9
2 1.00 1.00 1.00 11
accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30
As a next step let's visualize the trained decision tree model.
plt.figure(figsize=(20, 8))
a = tree.plot_tree(dt_pipeline[-1],
#use the feature names stored
feature_names=features.columns,
#use the class names stored
class_names=load_iris().target_names,
rounded=True,
filled=True,
fontsize=12)
plt.show()
Let's convert this tree representation into if-else rule set.
#export the decision rules
tree_rules = export_text(dt_pipeline[-1], feature_names=list(features.columns))
print(tree_rules)
|--- petal length (cm) <= 0.25 | |--- class: 0 |--- petal length (cm) > 0.25 | |--- petal length (cm) <= 0.66 | | |--- petal width (cm) <= 0.65 | | | |--- class: 1 | | |--- petal width (cm) > 0.65 | | | |--- class: 2 | |--- petal length (cm) > 0.66 | | |--- petal width (cm) <= 0.69 | | | |--- class: 1 | | |--- petal width (cm) > 0.69 | | | |--- class: 2
Let's get the feature importance from the trained decision tree model.
importance = pd.DataFrame({'features': features.columns,
'importance': np.round(dt_pipeline[-1].feature_importances_, 4)})
importance.sort_values('importance', ascending=False, inplace=True)
print(importance)
features importance 2 petal length (cm) 0.9346 3 petal width (cm) 0.0654 0 sepal length (cm) 0.0000 1 sepal width (cm) 0.0000
Now, perform HPT using GridSearchCV :
There are two configurable parameters in the tree classifier :
max_depthmin_samples_split
param_grid = [{'dt_classifier__max_depth': [1, 2, 3, 4, 5],
'dt_classifier__min_samples_split': [2, 4, 6, 8, 10]}]
gs_clf = GridSearchCV(dt_pipeline, param_grid, scoring='f1_macro')
gs_clf.fit(train_features, train_labels)
print('Best parameters : ', gs_clf.best_params_)
print()
print('Grid scores are as follows : \n')
means = gs_clf.cv_results_['mean_test_score']
stds = gs_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']):
print(f'{mean:0.3f} (+/-) {std*2:0.03f} for {params}\n')
Best parameters : {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 2}
Grid scores are as follows :
0.559 (+/-) 0.014 for {'dt_classifier__max_depth': 1, 'dt_classifier__min_samples_split': 2}
0.559 (+/-) 0.014 for {'dt_classifier__max_depth': 1, 'dt_classifier__min_samples_split': 4}
0.559 (+/-) 0.014 for {'dt_classifier__max_depth': 1, 'dt_classifier__min_samples_split': 6}
0.559 (+/-) 0.014 for {'dt_classifier__max_depth': 1, 'dt_classifier__min_samples_split': 8}
0.559 (+/-) 0.014 for {'dt_classifier__max_depth': 1, 'dt_classifier__min_samples_split': 10}
0.916 (+/-) 0.091 for {'dt_classifier__max_depth': 2, 'dt_classifier__min_samples_split': 2}
0.916 (+/-) 0.091 for {'dt_classifier__max_depth': 2, 'dt_classifier__min_samples_split': 4}
0.916 (+/-) 0.091 for {'dt_classifier__max_depth': 2, 'dt_classifier__min_samples_split': 6}
0.916 (+/-) 0.091 for {'dt_classifier__max_depth': 2, 'dt_classifier__min_samples_split': 8}
0.916 (+/-) 0.091 for {'dt_classifier__max_depth': 2, 'dt_classifier__min_samples_split': 10}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 3, 'dt_classifier__min_samples_split': 2}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 3, 'dt_classifier__min_samples_split': 4}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 3, 'dt_classifier__min_samples_split': 6}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 3, 'dt_classifier__min_samples_split': 8}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 3, 'dt_classifier__min_samples_split': 10}
0.941 (+/-) 0.115 for {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 2}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 4}
0.941 (+/-) 0.115 for {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 6}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 8}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 4, 'dt_classifier__min_samples_split': 10}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 5, 'dt_classifier__min_samples_split': 2}
0.924 (+/-) 0.112 for {'dt_classifier__max_depth': 5, 'dt_classifier__min_samples_split': 4}
0.941 (+/-) 0.115 for {'dt_classifier__max_depth': 5, 'dt_classifier__min_samples_split': 6}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 5, 'dt_classifier__min_samples_split': 8}
0.932 (+/-) 0.115 for {'dt_classifier__max_depth': 5, 'dt_classifier__min_samples_split': 10}
Confusion matrix for the best estimator obtained through the GridSearchCV.
ConfusionMatrixDisplay.from_estimator(
gs_clf.best_estimator_, test_features, test_labels)
plt.show()
Objective¶
In this notebook, we will implement multiclass MNIST digit recognition classifier with decision trees and ensemble techniques.
Importing basic libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# dataset loading through mnist
from keras.datasets import mnist
#training three classifiers: decision tree, bagging and random forest.
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# model selection utilitities for training and test split and cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
Loading MNIST dataset¶
We begin by loading the MNIST dataset with load_data in mnist class.
We obtain :
Training feature matrix and labels
Test feature matrix and labels
(X_train, y_train), (X_test, y_test) = mnist.load_data()
data = mnist.load_data()
plt.imshow(X_train[0])
plt.show()
There are 60000 examples in training set and 10000 examples in the test set.
Each example is a grey scale image of size 28 X 28.
There are 10 different labels - one for each digit - 0 to 9.
print('Shape of training data : ', X_train.shape)
print('Shape of training labels : ', y_train.shape)
print()
print('Shape of testing data : ', X_test.shape)
print('Shape of testing labels : ', y_test.shape)
Shape of training data : (60000, 28, 28) Shape of training labels : (60000,) Shape of testing data : (10000, 28, 28) Shape of testing labels : (10000,)
Before using the dataset for training and evaluation, we need to flatten it into a vector.
After flattening, we have training and test examples represented with a vector of 784 features.
Each feature records pixel intensity in each of 28 X 28 pixel.
We normalize the pixel intensity by dividing it with the maximum value i.e. 255. In that sense we have each feature value in the range 0 to 1.
# Flatten each input image into a vector of length 784.
X_train = X_train.reshape(X_train.shape[0], 28*28)
X_test = X_test.reshape(X_test.shape[0], 28*28)
# Normalizing
X_train = X_train/255
X_test = X_test/255
print('Shape of training data after flattening : ', X_train.shape)
print('Shape of testing data after flattening : ', X_test.shape)
Shape of training data after flattening : (60000, 784) Shape of testing data after flattening : (10000, 784)
We use ShuffleSplit cross validation with 10 splits and 20% data set aside for model evaluation as a test data .
cv = ShuffleSplit(n_splits=10,test_size=0.2, random_state=42)
Model Building¶
We define two functions :
- train_classifiers function :
It contains a common code for training classifiers for MNIST multiclass classification problem.
It takes
estimator, feature matrix, labels, cross validation strategy and name of the classifier as input.It first fits the estimator with feature matrix and labels.
It obtains cross validated
f1_macroscore for training set with 10-foldShuffleSplitcross validation and prints it.
def train_classifiers(estimator, X_train, y_train, cv, name):
estimator.fit(X_train, y_train)
cv_train_score = cross_val_score(
estimator, X_train, y_train, cv=cv, scoring='f1_macro')
print(
f'On an average, {name} model has f1 score of 'f'{cv_train_score.mean():.3f} (+/-) {cv_train_score.std():.3f} on the training set')
evalfunction :
It takes estimator, test feature matrix and labels as input and produce classification report and confusion matrix.
It first predicts labels for the test set.
Then it uses these predicted reports for calculating various evaluation metrics like precision, recall, f1 score and accuracy for each of the 10 classes.
It also obtains a confusion matrix by comparing these predictions iwth the actual labels and displays it with
ConfusionMatrixDisplayutility.
def eval(estimator, X_test ,y_test):
y_pred = estimator.predict(X_test)
print('Classification Report :')
print(classification_report(y_test ,y_pred))
print('Confusion Matrix : ')
sns.heatmap(confusion_matrix(y_test ,y_pred) ,cmap='Blues',annot=True ,cbar=True ,fmt='.5g')
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()
Let's train three classifiers with default parameters.
Decision tree
Bagging classifier - which uses decision tree as a default classifier and trains multiple decision tree classifiers on different bags obtained through boostrap sampling of training set.
Random forest classifier - which is also a bagging technique, which trains different decision tree classifiers by randomly selecting attributes for splitting on bags of boostrap sample of training set.
Decision trees for MNIST multiclass classification¶
We instantiate a decision tree classifier with default parameters and train it.
The train_classifier function prints the mean of cross validated accuracy and standard deviation of the trained classifier on the training set.
dt_pipeline = Pipeline([('classifier', DecisionTreeClassifier())])
train_classifiers(dt_pipeline, X_train,
y_train.ravel(), cv, 'decision tree')
On an average, decision tree model has f1 score of 0.867 (+/-) 0.004 on the training set
Let's evaluate the trained classifier on the test set.
eval(dt_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.92 0.93 0.93 980
1 0.95 0.96 0.96 1135
2 0.86 0.85 0.85 1032
3 0.83 0.85 0.84 1010
4 0.87 0.88 0.87 982
5 0.83 0.83 0.83 892
6 0.89 0.88 0.88 958
7 0.91 0.89 0.90 1028
8 0.83 0.80 0.81 974
9 0.86 0.85 0.85 1009
accuracy 0.87 10000
macro avg 0.87 0.87 0.87 10000
weighted avg 0.87 0.87 0.87 10000
Confusion Matrix :
MNIST classification with Bagging¶
First instantiate a bagging classifier object with default parameters and train it.
Observe the mean f1_score and its standard deviation obtained by the classifier based 10-fold cross validation of the training set.
# bagging_pipeline = Pipeline([('scaler',MinMaxScaler()),('classifier', BaggingClassifier())])
bag_pipeline = Pipeline([('classifier', BaggingClassifier())])
train_classifiers(bag_pipeline, X_train, y_train.ravel(), cv, 'bagging')
On an average, bagging model has f1 score of 0.938 (+/-) 0.003 on the training set
Let's evaluate the trained classifier on the test set.
eval(bag_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.94 0.98 0.96 980
1 0.97 0.99 0.98 1135
2 0.92 0.94 0.93 1032
3 0.93 0.93 0.93 1010
4 0.94 0.94 0.94 982
5 0.94 0.91 0.93 892
6 0.96 0.95 0.96 958
7 0.96 0.94 0.95 1028
8 0.93 0.91 0.92 974
9 0.94 0.93 0.93 1009
accuracy 0.94 10000
macro avg 0.94 0.94 0.94 10000
weighted avg 0.94 0.94 0.94 10000
Confusion Matrix :
Random forest for MNIST multiclass classification¶
Let's instantiate a random forest classifier object with default parameters and train it.
Observe the mean f1_score and its standard deviation obtained by the classifier based 10-fold cross validation of the training set.
rf_pipeline = Pipeline([('classifier', RandomForestClassifier())])
train_classifiers(rf_pipeline,X_train, y_train.ravel(), cv, 'random forest')
On an average, random forest model has f1 score of 0.967 (+/-) 0.001 on the training set
Now let's evaluate a random forest classifier on the test set and obtain classification report containing precision, recall, f1-score and accuracy for each class.
It also calculates confusion matrix and displays it with seaborn heatmap utility.
eval(rf_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.97 0.99 0.98 980
1 0.99 0.99 0.99 1135
2 0.96 0.97 0.96 1032
3 0.96 0.96 0.96 1010
4 0.97 0.98 0.97 982
5 0.97 0.96 0.97 892
6 0.97 0.98 0.98 958
7 0.97 0.96 0.97 1028
8 0.96 0.96 0.96 974
9 0.96 0.95 0.96 1009
accuracy 0.97 10000
macro avg 0.97 0.97 0.97 10000
weighted avg 0.97 0.97 0.97 10000
Confusion Matrix :
Summary¶
We trained three multi-class classifiers for handwritten digit recognition.
The decision tree classifier is a baseline classifier, which obtained accuracy of 87% on the test set.
Using bagging and training the same decision tree classifier gave us an increase of 7 percentage point in the accuracy, which translates to 94% accuracy on the test set.
Finally, the random forest classifier pushed that further to 97%.
We can see that how ensemble techniques give better results on the classification task compared to a single classifier.
Objective¶
In this notebook, we will apply ensemble techniques regression problem in california housing dataset.
We have already applied different regressors on california housing dataset. In this notebook, we will make use of :
Decision tree regressor
Bagging regressor
Random Forest regressor
We will observe the performance improvement when we use random forest over decision trees and bagging, which also uses decision tree regressor.
Importing basic libraries¶
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
np.random.seed(306)
Let's use ShuffleSplit as cv with 10 splits and 20% examples set aside as text examples.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
Let's download the data and split it into training and test sets.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
labels *= 100
com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels, random_state=42)
train_features, dev_features, train_labels, dev_labels = train_test_split(
com_train_features, com_train_labels, random_state=42)
Training different Regressors¶
Let's train different regressors :
def train_regressor(estimator, X_train, y_train, cv, name):
cv_results = cross_validate(estimator,
X_train,
y_train,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
cv_train_error = -1 * cv_results['train_score']
cv_test_error = -1 * cv_results['test_score']
print(f'On an average, {name} makes an error of ',
f'{cv_train_error.mean():.3f} (+/-) {cv_train_error.std():.3f} on the training set.')
print(f'On an average, {name} makes an error of ',
f'{cv_test_error.mean():.3f} (+/-) {cv_test_error.std():.3f} on the testing set.')
Decision Tree Regressor¶
train_regressor(DecisionTreeRegressor() ,com_train_features, com_train_labels ,cv, 'decision tree')
On an average, decision tree makes an error of 0.000 (+/-) 0.000 on the training set. On an average, decision tree makes an error of 47.456 (+/-) 1.125 on the testing set.
Bagging Regressor¶
train_regressor(BaggingRegressor(), com_train_features, com_train_labels, cv, 'bagging regressor')
On an average, bagging regressor makes an error of 14.453 (+/-) 0.167 on the training set. On an average, bagging regressor makes an error of 35.373 (+/-) 0.943 on the testing set.
Random Forest Regressor¶
train_regressor(RandomForestRegressor(), com_train_features, com_train_labels, cv, 'random forest regressor')
On an average, random forest regressor makes an error of 12.654 (+/-) 0.070 on the training set. On an average, random forest regressor makes an error of 33.208 (+/-) 0.710 on the testing set.
Parameter search for random-forest-regressor¶
param_grid = {
'n_estimators': [1, 2, 5, 10, 20, 50, 100, 200, 500],
'max_leaf_nodes': [2, 5, 10, 20, 50, 100]
}
search_cv = RandomizedSearchCV(
RandomForestRegressor(n_jobs=2), param_grid,
scoring='neg_mean_absolute_error', n_iter=10, random_state=0, n_jobs=-1,)
search_cv.fit(com_train_features, com_train_labels)
RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=2), n_jobs=-1,
param_distributions={'max_leaf_nodes': [2, 5, 10, 20, 50,
100],
'n_estimators': [1, 2, 5, 10, 20, 50,
100, 200, 500]},
random_state=0, scoring='neg_mean_absolute_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=2), n_jobs=-1,
param_distributions={'max_leaf_nodes': [2, 5, 10, 20, 50,
100],
'n_estimators': [1, 2, 5, 10, 20, 50,
100, 200, 500]},
random_state=0, scoring='neg_mean_absolute_error')RandomForestRegressor(n_jobs=2)
RandomForestRegressor(n_jobs=2)
columns = [f'param_{name}' for name in param_grid.keys()]
columns += ['mean_test_error', 'std_test_error']
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results['mean_test_error'] = -cv_results['mean_test_score']
cv_results['std_test_error'] = cv_results['std_test_score']
cv_results[columns].sort_values(by='mean_test_error')
| param_n_estimators | param_max_leaf_nodes | mean_test_error | std_test_error | |
|---|---|---|---|---|
| 0 | 500 | 100 | 40.594643 | 0.703924 |
| 2 | 10 | 100 | 40.901778 | 0.821947 |
| 7 | 100 | 50 | 43.889529 | 0.767655 |
| 8 | 1 | 100 | 45.292752 | 0.983549 |
| 9 | 10 | 20 | 49.497432 | 0.934705 |
| 6 | 50 | 20 | 49.512017 | 1.109143 |
| 1 | 100 | 20 | 49.562485 | 1.050607 |
| 3 | 500 | 10 | 54.974162 | 1.081898 |
| 4 | 5 | 5 | 61.522427 | 1.371272 |
| 5 | 5 | 2 | 72.953346 | 1.245182 |
error = - search_cv.score(test_features, test_labels)
print(f'On average, our random forest regressor makes an error of {error:.2f}.')
On average, our random forest regressor makes an error of 40.47.
Objective¶
In this notebook, we will implement multiclass MNIST digit recognition classifier with boosting :
AdaBoost
GradientBoosting
XGBoost
Importing basic libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from keras.datasets import mnist
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.pipeline import Pipeline
Loading MNIST dataset¶
Begin by loading MNIST dataset with load_data function in mnist class.
We obtain:
Training feature matrix and labels
Test feature matrix and labels
(X_train, y_train), (X_test, y_test) = mnist.load_data()
data = mnist.load_data()
There are 60000 examples in training set and 10000 examples in the test set.
Each example is a grey scale image of size 28 X 28.
There are 10 different labels - one for each digit - 0 to 9.
# Flatten each input image into a vector of length 784
X_train = X_train.reshape(X_train.shape[0], 28*28)
X_test = X_test.reshape(X_test.shape[0], 28*28)
# Normalizing
X_train = X_train / 255
X_test = X_test / 255
We use ShuffleSplit cross validation with 10 splits and 20% data set aside for model evaluation as a test data .
cv = ShuffleSplit(n_splits=10,test_size=0.2, random_state=42)
Model Building¶
We define two functions :
def train_classifiers(estimator, X_train, y_train, cv, name):
estimator.fit(X_train, y_train)
cv_train_score = cross_val_score(
estimator, X_train, y_train, cv=cv, scoring='f1_macro')
print(
f'On an average, {name} model has f1 score of 'f'{cv_train_score.mean():.3f} (+/-) {cv_train_score.std():.3f} on the training set')
def eval(estimator, X_test, y_test):
y_pred = estimator.predict(X_test)
print('Classification Report :')
print(classification_report(y_test, y_pred))
print('Confusion Matrix : ')
sns.heatmap(confusion_matrix(y_test, y_pred),
cmap='Greens', annot=True, cbar=True, fmt='.5g')
# ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()
AdaBoost for MNIST multiclass classification¶
We instantiate a decision tree classifier with default parameters and train it.
The train_classifier function prints the means of cross validated accuracy and standard deviation of the trained classifier on the training set.
adb_pipeline = Pipeline([('classifier', AdaBoostClassifier())])
train_classifiers(adb_pipeline, X_train, y_train.ravel(),
cv, 'AdaBoostClassifier')
On an average, AdaBoostClassifier model has f1 score of 0.712 (+/-) 0.016 on the training set
eval(adb_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.84 0.90 0.87 980
1 0.85 0.94 0.90 1135
2 0.79 0.58 0.67 1032
3 0.68 0.67 0.68 1010
4 0.72 0.72 0.72 982
5 0.69 0.59 0.64 892
6 0.72 0.86 0.78 958
7 0.68 0.78 0.73 1028
8 0.66 0.68 0.67 974
9 0.62 0.55 0.58 1009
accuracy 0.73 10000
macro avg 0.73 0.73 0.72 10000
weighted avg 0.73 0.73 0.73 10000
Confusion Matrix :
GradientBoostingClassifier for MNIST classification¶
Let's instantiate a gradient boosting classifier object with default parameters and train it.
Observe the mean f1_score and its standard deviation obtained by the classifier based 10-fold cross validation of the training set.
grb_pipeline = Pipeline(
[('classifier', GradientBoostingClassifier(n_estimators=10))])
train_classifiers(grb_pipeline, X_train, y_train.ravel(),
cv, 'GradientBoostingClassifier')
On an average, GradientBoostingClassifier model has f1 score of 0.835 (+/-) 0.003 on the training set
Let's evaluate the trained classifier on the test set.
eval(grb_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.89 0.96 0.93 980
1 0.89 0.95 0.92 1135
2 0.90 0.83 0.86 1032
3 0.81 0.84 0.83 1010
4 0.79 0.85 0.82 982
5 0.87 0.70 0.78 892
6 0.93 0.85 0.89 958
7 0.91 0.82 0.86 1028
8 0.76 0.81 0.78 974
9 0.75 0.83 0.79 1009
accuracy 0.85 10000
macro avg 0.85 0.84 0.84 10000
weighted avg 0.85 0.85 0.85 10000
Confusion Matrix :
XGBoost Classifier for MNIST classification¶
from xgboost import XGBClassifier
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import MultiIndex, Int64Index
xgbc_pipeline = Pipeline([("classifier",XGBClassifier())])
train_classifiers(xgbc_pipeline,X_train, y_train.ravel(), cv, 'GradientBoostingClassifier')
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:01:16] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [20:04:43] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:07:37] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:10:27] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:13:23] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:16:11] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:19:02] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:21:51] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:24:40] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:27:29] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
[20:30:20] WARNING: ..\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior. On an average, GradientBoostingClassifier model has f1 score of 0.976 (+/-) 0.001 on the training set
eval(xgbc_pipeline, X_test, y_test)
Classification Report :
precision recall f1-score support
0 0.97 0.99 0.98 980
1 0.99 0.99 0.99 1135
2 0.97 0.97 0.97 1032
3 0.97 0.98 0.98 1010
4 0.98 0.97 0.98 982
5 0.98 0.97 0.98 892
6 0.98 0.98 0.98 958
7 0.98 0.97 0.97 1028
8 0.97 0.97 0.97 974
9 0.97 0.97 0.97 1009
accuracy 0.98 10000
macro avg 0.98 0.98 0.98 10000
weighted avg 0.98 0.98 0.98 10000
Confusion Matrix :
Summary¶
We trained three multi-class classifiers for handwritten digit recognition.
Firstly, the AdaBoost classifier obtained an accuracy of 71% on the test set.
Next ,using Gradient boosting clssifier gave us an increase of 12 percentage in the accuracy, which translates to 83% accuracy on the test set.
Finally, the XGBoost classifier pushed that further to 97%.
We can see that how ensemble techniques give better results on the classification task compared to a single classifier.
Objective¶
In this notebook, we will apply ensemble techniques regression problem in california housing dataset.
We have already applied different regressors on california housing dataset. In this notebook, we will make use of :
AdaBoost regressor
Gradient Boosting regressor
XGBoost regressor
Importing basic libraries¶
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
import warnings
warnings.filterwarnings('ignore')
c:\Users\faizan\anaconda3\envs\tensorflow\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import MultiIndex, Int64Index
np.random.seed(306)
Let's use ShuffleSplit as cv with 10 splits and 20% examples set aside as text examples.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
Let's download the data and split it into training and test sets.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
labels *= 100
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
features, labels, random_state=42)
train_features, dev_features, train_labels, dev_labels = train_test_split(
com_train_features, com_train_labels, random_state=42)
Training different Regressors¶
Let's train different regressors :
def train_regressor(estimator, X_train, y_train, cv, name):
cv_results = cross_validate(estimator,
X_train,
y_train,
cv=cv,
scoring='neg_mean_absolute_error',
return_train_score=True,
return_estimator=True)
cv_train_error = -1 * cv_results['train_score']
cv_test_error = -1 * cv_results['test_score']
print(f'On an average, {name} makes an error of ',
f'{cv_train_error.mean():.3f} (+/-) {cv_train_error.std():.3f} on the training set.')
print(f'On an average, {name} makes an error of ',
f'{cv_test_error.mean():.3f} (+/-) {cv_test_error.std():.3f} on the testing set.')
AdaBoost Regressor¶
train_regressor(AdaBoostRegressor(), com_train_features,com_train_labels, cv, 'AdaBoostRegressor')
On an average, AdaBoostRegressor makes an error of 73.263 (+/-) 6.031 on the training set. On an average, AdaBoostRegressor makes an error of 73.623 (+/-) 6.057 on the testing set.
Gradient Boosting Regressor¶
train_regressor(GradientBoostingRegressor(), com_train_features,
com_train_labels, cv, 'GradientBoostingRegressor')
On an average, GradientBoostingRegressor makes an error of 35.394 (+/-) 0.273 on the training set. On an average, GradientBoostingRegressor makes an error of 36.773 (+/-) 0.723 on the testing set.
XGBoost Regressor¶
train_regressor(XGBRegressor(), com_train_features,
com_train_labels, cv, 'XGBoostRegressor')
On an average, XGBoostRegressor makes an error of 18.308 (+/-) 0.182 on the training set. On an average, XGBoostRegressor makes an error of 31.845 (+/-) 0.753 on the testing set.
Clustering¶
Clustering is concerned about grouping objects with similar attributes or characteristics
The objects in the same cluster are closer to one another than the objects from the other clusters.
In the image above, the clusters with same color share similar properties(feature values represented on axis).
For instance, if the x-axis represents weight and y-axis represents height, the yellow color cluster represents people with low BMI.
Similar interpretations can be drawn for the remaining clusters.
Hierarchical Agglomerative Clustering (HAC)¶
Earlier in this week, we studied k-means clustering algorithm.
In this notebook, we will discuss another clustering algorithm which is Hierarchical agglomerative clustering (HAC) algorithm.
Hierarchical clustering start by considering each datapoint as a cluster and then combines closest clusters to form larger clusters i.e it follows a bottoms-up approach.
There is an alternate approach, which is top-down approach, where the entire data is considered as a one single cluster, which is divided to form smaller clusters in each step.
This is another type of hierarchical clustering also known as Divisive Hierarchical Clustering (DHC).
The merging and splitting decisions are influenced by certain conditions that will be discussed shortly.
Metrics¶
Certain metrics are used for calculating similarity between clusters.
Note: Metric is a generalization of concept of distance.
The metrics follow certain properties like :
non-negative
sysmetric
follows triangle inequality
Some of the popular metric functions are :
- Euclidean distance -
\begin{align} d(x^{(i)} , x^{(j)}) = \sqrt{\sum{^m _{l=1}} {(x_l^{(i)} - x_l^{(j)})^2}} \end{align}
- Manhattan distance -
\begin{align} d(x^{(i)} , x^{(j)}) = \sum{^m _{l=1}} {\left\lvert(x_l^{(i)} - x_l^{(j)})\right\rvert} \end{align}
- Cosine distance -
\begin{align} d(x^{(i)} , x^{(j)}) = 1 - \frac{x^{(i)}. x^{(j)}}{\left\lvert \left\lvert x^{(i)} \right\rvert \right\rvert \left\lvert \left\lvert x^{(j)} \right\rvert \right\rvert} = 1 - \cos{\theta} \end{align}
Linkage¶
Linkage is a strategy for aggregating clusters.
There are four linkages that we will study :
Single linkage
Average linkage
Complete linkage
Ward's linkage
The Single linkage criterion merges clusters based on the shortest distance over all possible pairs i.e.
$ \left ({ \mathbf \{ x_{r_1}^{(i)}\}_{i=1}^{|r_1|} },{\mathbf \{ x_{r_2}^{(j)}\}_{j=1}^{|r_2|} } \right) = \text {min}_{i,j} d\left(\mathbf x_{r_1}^{(i)}, \mathbf x_{r_2}^{(j)}\right) $

The Complete linkage merges clusters to minimize the maximum distance between the clusters (in other words, the distance of the furthest elements)
$ \left ({ \mathbf \{ x_{r_1}^{(i)}\}_{i=1}^{|r_1|} },{\mathbf \{ x_{r_2}^{(j)}\}_{j=1}^{|r_2|} } \right) = \text {max}_{i,j} d\left(\mathbf x_{r_1}^{(i)}, \mathbf x_{r_2}^{(j)}\right) $

The average linkage criterion uses average distance over all possible pairs between the groups for merging clusters.
$ \left ({ \mathbf \{ x_{r_1}^{(i)}\}_{i=1}^{|r_1|} },{\mathbf \{ x_{r_2}^{(j)}\}_{j=1}^{|r_2|} } \right) = \frac {1}{|r_1r_2|} \sum_{i=1}^{|r_1|} \sum_{j=1}^{|r_2|} d\left(\mathbf x_{r_1}^{(i)}, \mathbf x_{r_2}^{(j)}\right) $

Ward's linkage
It computes the sum of squared distances withing the clusters.
$ \left ({ \mathbf \{ x_{r_1}^{(i)}\}_{i=1}^{|r_1|} } , {\mathbf \{ x_{r_2}^{(j)}\}_{j=1}^{|r_2|} } \right) = \sum_{i=1}^{|r_1|} \sum_{j=1}^{|r_2|} ||(\mathbf x_{r_1}^{(i)} - \mathbf x_{r_2}^{(j)} ||^2 $
Algorithm :¶
Calculate the distance matrix between pairs of clusters.
While all the objects are clustered into one.
- Detect the two closest groups (clusters) and merge them.
Dendrogram¶
Dendrograms are a graphical representation of the agglomerative process which shows a how aggregation happens at each level.
Importing Libraries¶
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns ;sns.set()
from sklearn.preprocessing import normalize
Lets take example of a toy dataset to understand this :
X = np.array([(8, 3), (5, 3), (6, 4), (1, 6), (2, 8)])
X_scaled = normalize(X)
plt.scatter(X[:, 0], X[:, 1])
plt.show()
Let's plot the dendrogram with scipy.cluster.hierarchy library
import scipy.cluster.hierarchy as sch
plt.figure(figsize=(8, 8))
plt.title('Dendrogram')
dend = sch.dendrogram(sch.linkage(X_scaled, method='ward'))
HAC is implemented in sklearn.cluster module as AgglomerativeClustering class.
Objective¶
In this notebook, we will implement k-means algorithm using sklearn.
Importing Libraries¶
from IPython.display import display, Math, Latex
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
Clustering of digits¶
We will use digit dataset for clustering, which is loaded through the load_digits API.
It loads
8x8digit images with approximately 180 samples per class.From 10 classes, it has total of 1797 images.
Each pixel has value between 0 and 16.
digits = load_digits()
data = np.column_stack((digits.data, digits.target))
columns = np.append(digits.feature_names, ['targets'])
df_digits = pd.DataFrame(data, columns=columns)
df_digits.head()
| pixel_0_0 | pixel_0_1 | pixel_0_2 | pixel_0_3 | pixel_0_4 | pixel_0_5 | pixel_0_6 | pixel_0_7 | pixel_1_0 | pixel_1_1 | ... | pixel_6_7 | pixel_7_0 | pixel_7_1 | pixel_7_2 | pixel_7_3 | pixel_7_4 | pixel_7_5 | pixel_7_6 | pixel_7_7 | targets | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 5.0 | 13.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 6.0 | 13.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 12.0 | 13.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 16.0 | 10.0 | 0.0 | 0.0 | 1.0 |
| 2 | 0.0 | 0.0 | 0.0 | 4.0 | 15.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 11.0 | 16.0 | 9.0 | 0.0 | 2.0 |
| 3 | 0.0 | 0.0 | 7.0 | 15.0 | 13.0 | 1.0 | 0.0 | 0.0 | 0.0 | 8.0 | ... | 0.0 | 0.0 | 0.0 | 7.0 | 13.0 | 13.0 | 9.0 | 0.0 | 0.0 | 3.0 |
| 4 | 0.0 | 0.0 | 0.0 | 1.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 16.0 | 4.0 | 0.0 | 0.0 | 4.0 |
5 rows × 65 columns
df_digits.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1797 entries, 0 to 1796 Data columns (total 65 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pixel_0_0 1797 non-null float64 1 pixel_0_1 1797 non-null float64 2 pixel_0_2 1797 non-null float64 3 pixel_0_3 1797 non-null float64 4 pixel_0_4 1797 non-null float64 5 pixel_0_5 1797 non-null float64 6 pixel_0_6 1797 non-null float64 7 pixel_0_7 1797 non-null float64 8 pixel_1_0 1797 non-null float64 9 pixel_1_1 1797 non-null float64 10 pixel_1_2 1797 non-null float64 11 pixel_1_3 1797 non-null float64 12 pixel_1_4 1797 non-null float64 13 pixel_1_5 1797 non-null float64 14 pixel_1_6 1797 non-null float64 15 pixel_1_7 1797 non-null float64 16 pixel_2_0 1797 non-null float64 17 pixel_2_1 1797 non-null float64 18 pixel_2_2 1797 non-null float64 19 pixel_2_3 1797 non-null float64 20 pixel_2_4 1797 non-null float64 21 pixel_2_5 1797 non-null float64 22 pixel_2_6 1797 non-null float64 23 pixel_2_7 1797 non-null float64 24 pixel_3_0 1797 non-null float64 25 pixel_3_1 1797 non-null float64 26 pixel_3_2 1797 non-null float64 27 pixel_3_3 1797 non-null float64 28 pixel_3_4 1797 non-null float64 29 pixel_3_5 1797 non-null float64 30 pixel_3_6 1797 non-null float64 31 pixel_3_7 1797 non-null float64 32 pixel_4_0 1797 non-null float64 33 pixel_4_1 1797 non-null float64 34 pixel_4_2 1797 non-null float64 35 pixel_4_3 1797 non-null float64 36 pixel_4_4 1797 non-null float64 37 pixel_4_5 1797 non-null float64 38 pixel_4_6 1797 non-null float64 39 pixel_4_7 1797 non-null float64 40 pixel_5_0 1797 non-null float64 41 pixel_5_1 1797 non-null float64 42 pixel_5_2 1797 non-null float64 43 pixel_5_3 1797 non-null float64 44 pixel_5_4 1797 non-null float64 45 pixel_5_5 1797 non-null float64 46 pixel_5_6 1797 non-null float64 47 pixel_5_7 1797 non-null float64 48 pixel_6_0 1797 non-null float64 49 pixel_6_1 1797 non-null float64 50 pixel_6_2 1797 non-null float64 51 pixel_6_3 1797 non-null float64 52 pixel_6_4 1797 non-null float64 53 pixel_6_5 1797 non-null float64 54 pixel_6_6 1797 non-null float64 55 pixel_6_7 1797 non-null float64 56 pixel_7_0 1797 non-null float64 57 pixel_7_1 1797 non-null float64 58 pixel_7_2 1797 non-null float64 59 pixel_7_3 1797 non-null float64 60 pixel_7_4 1797 non-null float64 61 pixel_7_5 1797 non-null float64 62 pixel_7_6 1797 non-null float64 63 pixel_7_7 1797 non-null float64 64 targets 1797 non-null float64 dtypes: float64(65) memory usage: 912.7 KB
Some of the important parameters of KMeans are as follows :
initn_initmax_iterrandom_state
Since KMeans algorithm is susceptible to local minima, we perform multiple KMeans fit and select the ones with the lowest value of sum of squared error.
The total number of time, we would like to run
KMeansalgorithm is specified throughn_initparameter.max_iterspecifies total number of iterations to perform before declaring the convergence.
Let's define parameters of KMeans clustering algorithm in a dictionary object.
kmeans_kwargs = {
'init': 'random',
'n_init': 50,
'max_iter': 500,
'random_state': 0
}
Model Building¶
Let's define a pipeline with two stages :
preprocessing for feature scaling with
MinMaxScaler.clustering with
KMeansclustering algorithm.
pipeline = Pipeline([('scaler', MinMaxScaler()),
('clustering', KMeans(n_clusters=10, **kmeans_kwargs))])
pipeline.fit(digits.data)
Pipeline(steps=[('scaler', MinMaxScaler()),
('clustering',
KMeans(init='random', max_iter=500, n_clusters=10, n_init=50,
random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', MinMaxScaler()),
('clustering',
KMeans(init='random', max_iter=500, n_clusters=10, n_init=50,
random_state=0))])MinMaxScaler()
KMeans(init='random', max_iter=500, n_clusters=10, n_init=50, random_state=0)
The cluster centroids can be accessed via cluster_centers_ member variable of KMeans class
cluster_centers = pipeline[-1].cluster_centers_
cluster_centers.shape
(10, 64)
Dispalying the cluster centroids.¶
fig, ax = plt.subplots(5, 2, figsize=(8,8))
for i, j in zip(ax.flat, cluster_centers.reshape(10, 8, 8)):
i.imshow(j)
fig, ax = plt.subplots(2, 5, figsize=(8,8))
for i, j in zip(ax.flat, cluster_centers.reshape(10, 8, 8)):
i.imshow(j)
In this case, the number of clusters were known, hence we set k=10 and got the clusters.
For deciding the optimal number of clusters through elbow and silhouette, we will pretend that we do not know the clusters in the data and we will try to discover the optimal number of clusters through these two methods one by one:¶
Elbow method¶
Here we keep track of sum-of-squared error (SSE) in a list for each value of k.
sse_digit = []
scaled_digits = MinMaxScaler().fit_transform(digits.data)
for k in range(1, 12):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_digits)
sse_digit.append(kmeans.inertia_)
Note that the SSE for a given clustering output is obtained through inertia_.
plt.plot(range(1, 12), sse_digit)
plt.xticks(range(1, 15))
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()
There is a slight elbow at k=9, which could point to the fact that a few digits may have been merged in one cluster.
Silhoutte Score¶
sil_coef_digits = []
for k in range(2, 15):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_digits)
score = silhouette_score(digits.data, kmeans.labels_)
sil_coef_digits.append(score)
plt.plot(range(2, 15), sil_coef_digits)
plt.xticks(range(2, 15))
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.grid(True)
plt.show()
Get the value of K for which the Silhouette coefficient is the highest.
# 2 is added since iteration is starting at 2.
print(np.argmax(sil_coef_digits) + 2)
9
Objective¶
In this notebook, we will demonstrate working of MLPClassifier to classify handwritten digits in MNIST dataset.
Importing Libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedShuffleSplit
import warnings
warnings.filterwarnings('ignore')
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=42)
Loading the dataset¶
Lets use the MNIST dataet for the demo of MLPClassifier.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X.to_numpy()
y = y.to_numpy()
Train test split¶
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print('Shape of training data before flattening : ',X_train.shape)
print('Shape of testing data before flattening : ',X_test.shape)
Shape of training data before flattening : (60000, 784) Shape of testing data before flattening : (10000, 784)
Reshaping¶
X_train = X_train.reshape(X_train.shape[0] ,28*28)
X_test = X_test.reshape(X_test.shape[0] ,28*28)
Normalizing¶
X_train = X_train / 255
X_test = X_test / 255
print('Shape of training data after flattening : ',X_train.shape)
print('Shape of testing data after flattening : ',X_test.shape)
Shape of training data after flattening : (60000, 784) Shape of testing data after flattening : (10000, 784)
print('Shape of training data : ',X_train.shape)
print('Shape of testing data : ', X_test.shape)
print('Shape of training labels : ',y_train.shape)
print('Shape of testing labels :', y_test.shape)
Shape of training data : (60000, 784) Shape of testing data : (10000, 784) Shape of training labels : (60000,) Shape of testing labels : (10000,)
Fit MLPClassifier on MNIST dataset¶
Let us train a MLPClassifier with one hidden layer having 128 neurons.
mlpc = MLPClassifier(hidden_layer_sizes=(128,))
mlpc.fit(X_train, y_train)
MLPClassifier(hidden_layer_sizes=(128,))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MLPClassifier(hidden_layer_sizes=(128,))
cv_score = cross_val_score(mlpc, X_train, y_train.ravel(), cv=cv)
print('Training accuracy : {:.2f} %'.format(cv_score.mean() *100))
Training accuracy : 97.22 %
Prediction probabilities on testing data
mlpc.predict_proba(X_test[:5])
array([[2.37458014e-12, 1.42969538e-15, 7.42782257e-09, 2.07770573e-06,
1.36965860e-21, 7.15259561e-15, 3.17157909e-21, 9.99997914e-01,
1.79333277e-12, 4.13385362e-10],
[4.34717050e-16, 2.60988954e-11, 1.00000000e+00, 4.20811110e-10,
1.17613965e-30, 1.21997285e-14, 1.54206455e-14, 5.04768969e-31,
4.16630006e-13, 6.09012814e-26],
[2.92345224e-11, 9.99996191e-01, 2.87893448e-08, 2.58269590e-11,
1.61243656e-08, 2.96121060e-09, 1.93673881e-09, 2.76282333e-08,
3.73169140e-06, 7.71828261e-12],
[9.99999961e-01, 2.08845641e-18, 1.88509402e-08, 3.71971203e-13,
1.06326320e-13, 1.39581817e-12, 1.24417890e-10, 1.97907116e-08,
3.48106592e-15, 4.12848473e-11],
[9.00666687e-15, 1.34092540e-16, 1.30169831e-10, 2.31364539e-18,
9.99949124e-01, 2.66617628e-16, 6.02472590e-13, 4.44053394e-11,
2.62229605e-14, 5.08762712e-05]])
Prediction of class labels of testing data
y_pred = mlpc.predict(X_test)
print('Training accuracy : {:.2f}'.format(accuracy_score(y_train, mlpc.predict(X_train)) *100))
print('Testing accuracy : {:.2f}'.format(accuracy_score(y_test, y_pred) *100))
Training accuracy : 100.00 Testing accuracy : 98.01
Confusion Matrix
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, fmt='.4g', cmap='Reds')
plt.show()
Classification Report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.98 0.99 0.99 980
1 0.99 0.99 0.99 1135
2 0.98 0.98 0.98 1032
3 0.97 0.98 0.98 1010
4 0.98 0.98 0.98 982
5 0.99 0.98 0.98 892
6 0.99 0.99 0.99 958
7 0.98 0.97 0.98 1028
8 0.97 0.97 0.97 974
9 0.97 0.98 0.97 1009
accuracy 0.98 10000
macro avg 0.98 0.98 0.98 10000
weighted avg 0.98 0.98 0.98 10000
Plot of test data along with predicted class labels¶
fig = plt.figure(figsize=(10,8))
for i in range(25):
ax = fig.add_subplot(5, 5, i+1)
ax.imshow(X_test[i].reshape(28,28), cmap=plt.get_cmap('gray'))
ax.set_title('Label (y): {y}'.format(y=y_pred[i]))
plt.axis('off')
Visualization of MLP weights in hidden layer¶
Looking at the learned coefficients of a neural network can provide insiht into the learning behaviour.
The input data comtains 784 features in the dataset.
We have used one hidden layer with 128 neurons. Therefore, weight matrix has the shape (784, 128).
We can therefore visualize a single column of the weight matrix as a 28x28 pixel image.
w = mlpc.coefs_
w = np.array(w[0])
w.shape
(784, 128)
w1 = np.array(w[:,0])
w1.shape
(784,)
w_matrix = w1.reshape(28,28)
fig = plt.figure()
plt.imshow(w_matrix, cmap='gray')
plt.grid(False)
plt.axis(False)
plt.colorbar()
plt.show()
fig, axes = plt.subplots(4,4)
vmin, vmax = mlpc.coefs_[0].min(), mlpc.coefs_[0].max()
for coef, ax in zip(mlpc.coefs_[0].T, axes.ravel()):
ax.matshow(coef.reshape(28,28), cmap=plt.cm.gray, vmin=0.5 * vmin, vmax=0.5 * vmax)
ax.set_xticks(())
ax.set_yticks(())
plt.show()
Loss Curve¶
plt.plot(mlpc.loss_curve_)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss curve')
plt.show()
MLP Regressor¶
MLPRegressor implements a multi-layer perceptron (MLP) that trains using backpropagation with no activation function in the output layer.
Therefore, it uses the square error as the loss function, and the output is a set of continuous values.
Importing Libraries¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split ,GridSearchCV ,RandomizedSearchCV, cross_validate, ShuffleSplit
import warnings
warnings.filterwarnings('ignore')
np.random.seed(306)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
Loading the dataset¶
This dataset can be fetched from sklearn with fetch_california_housing API.
from sklearn.datasets import fetch_california_housing
X,y = fetch_california_housing(return_X_y=True)
print('Shape of feature matrix : ' ,X.shape)
print('Shape of label vector : ',y.shape)
Shape of feature matrix : (20640, 8) Shape of label vector : (20640,)
Split data into train & test sets¶
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)
print('Shape of training feature matrix : ' ,X_train.shape)
print('Shape of training label vector : ',y_train.shape)
print()
print('Shape of test feature matrix : ' ,X_test.shape)
print('Shape of test label vector : ',y_test.shape)
Shape of training feature matrix : (16512, 8) Shape of training label vector : (16512,) Shape of test feature matrix : (4128, 8) Shape of test label vector : (4128,)
Fit a pipeline to implement MLPRegressor¶
Let's train a MLPRegressor with 3 hidden layers having 128 neurons each.
pipe = Pipeline([('scaler', StandardScaler()),
('regressor', MLPRegressor(hidden_layer_sizes=(32)))])
cv_results = cross_validate(pipe,
X_train,
y_train,
cv=cv,
scoring="neg_mean_absolute_percentage_error",
return_train_score=True,
return_estimator=True)
mlp_train_error = -1 * cv_results['train_score']
mlp_test_error = -1 * cv_results['test_score']
print(
f"Mean absolute error of MLP regressor model on the train set :\n" f"{mlp_train_error.mean():.3f} +/- {mlp_train_error.std():.3f}")
print()
print(
f"Mean absolute error of MLP regressor model on the test set :\n" f"{mlp_test_error.mean():.3f} +/- {mlp_test_error.std():.3f}")
Mean absolute error of MLP regressor model on the train set : 0.212 +/- 0.004 Mean absolute error of MLP regressor model on the test set : 0.213 +/- 0.007
pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()),
('regressor', MLPRegressor(hidden_layer_sizes=32))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
('regressor', MLPRegressor(hidden_layer_sizes=32))])StandardScaler()
MLPRegressor(hidden_layer_sizes=32)
mean_absolute_percentage_error(y_train, pipe.predict(X_train))
0.2134059102590712
mean_absolute_percentage_error(y_test, pipe.predict(X_test))
0.22395117090285324
Plotting Predicitons¶
plt.figure(figsize=(8,8))
plt.plot(y_test, pipe.predict(X_test), 'b.')
plt.plot(y_test, y_test ,'g-')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()